diff --git a/README.md b/README.md
index d3a12ba4c..8fb279b8b 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 
 ![FreeCut Timeline Editor](./public/assets/landing/timeline.png)
 
-FreeCut is a browser-based multi-track video editor. No installation, no uploads — everything runs locally in your browser using WebGPU, WebCodecs, OPFS, and the File System Access API.
+FreeCut is a browser-based multi-track video editor. No installation, no uploads — everything runs locally in your browser using WebGPU, WebCodecs, and the File System Access API. Projects, media metadata, thumbnails, waveforms, and transcripts are written as plain files to a workspace folder you pick on disk.
 
 ## Features
 
@@ -45,8 +45,10 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 
 ### Transitions
 
-- **CPU transitions** — fade, wipe, slide, 3D flip, clock wipe, iris — each with directional variants
-- **GPU transitions** — dissolve, sparkles, glitch, light leak, pixelate, chromatic aberration, radial blur
+All transitions are WebGPU-accelerated with a Canvas 2D fallback for non-WebGPU environments.
+
+- Fade, wipe, slide, 3D flip, clock wipe, iris — each with directional variants
+- Dissolve, sparkles, glitch, light leak, pixelate, chromatic aberration, radial blur
 - Adjustable duration and alignment
 
 ### Keyframe Animation
 
@@ -78,7 +80,7 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 - **Audio:** MP3, WAV, AAC, OGG, Opus
 - **Image:** JPG, PNG, GIF (animated), WebP
 - Up to 5 GB per file
-- OPFS proxy video generation for smooth preview
+- Proxy video generation for smooth preview (cached to the workspace folder)
 - Media relinking for moved or deleted files
 - Scene detection and optical flow analysis
 
@@ -89,19 +91,24 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 - Auto-generate caption text items from transcripts
 - Multi-language support
 
+### Text-to-Speech
+
+- In-browser voiceover generation via KittenTTS (WebGPU)
+- Adds the generated audio clip directly to the timeline
+
 ### Other
 
 - Native SVG shapes — rectangle, circle, triangle, ellipse, star, polygon, heart
 - Text overlays with custom fonts, colors, and positioning
 - Project bundles — export/import projects as ZIP files with Zod-validated schemas
-- IndexedDB persistence with content-addressable storage
+- Workspace folder persistence via the File System Access API — your projects live as plain files on disk, not locked away in browser storage
 - Auto-save
 - Customizable keyboard shortcuts with preset import/export
 - Configurable settings (default FPS, snap, waveforms, filmstrips, preview quality, export defaults, undo depth, auto-save interval)
 
 ## Quick Start
 
-**Prerequisites:** Node.js 18+
+**Prerequisites:** Node.js 20+
 
 ```bash
 git clone https://github.com/walterlow/freecut.git
@@ -114,12 +121,13 @@
 Open [http://localhost:5173](http://localhost:5173) in Chrome.
 
 ### Workflow
 
-1. Create a project from the projects page
-2. Import media by dragging files into the media library
-3. Drag clips to the timeline — trim, arrange, add effects and transitions
-4. Animate with the keyframe editor
-5. Preview your edit in real time
-6. Export directly from the browser
+1. Pick a workspace folder when prompted — FreeCut writes all projects, media metadata, and caches into this folder
+2. Create a project from the projects page
+3. Import media by dragging files into the media library
+4. Drag clips to the timeline — trim, arrange, add effects and transitions
+5. Animate with the keyframe editor
+6. Preview your edit in real time
+7. Export directly from the browser
 
 ## Browser Support
 
@@ -184,7 +192,9 @@ Brave disables the File System Access API by default. To enable it:
 - [Tailwind CSS 4](https://tailwindcss.com/) + [shadcn/ui](https://ui.shadcn.com/) — styling and UI components
 - [Mediabunny](https://mediabunny.dev/) — media decoding and metadata extraction
 - [WebCodecs](https://developer.mozilla.org/en-US/docs/Web/API/WebCodecs_API) — composition rendering and export
-- [OPFS](https://developer.mozilla.org/en-US/docs/Web/API/File_System_API/Origin_private_file_system) + [IndexedDB](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API) — local persistence
+- [File System Access API](https://developer.mozilla.org/en-US/docs/Web/API/File_System_API) — workspace folder persistence
+- [Transformers.js](https://huggingface.co/docs/transformers.js) — in-browser Whisper transcription
+- [KittenTTS](https://github.com/KittenML/kitten-tts-webgpu) — WebGPU text-to-speech
 - Web Workers — heavy processing off the main thread
 
 ## Development
 
@@ -235,7 +245,7 @@ src/
 | |- animation/ # Easing functions and interpolation
 | |- projects/ # Project domain types
 | \- timeline/ # Transitions (engine, registry, renderers)
-|- infrastructure/ # Browser/storage/GPU adapters
+|- infrastructure/ # Browser/storage/GPU adapters (workspace-fs, handles-db, gpu facades)
 |- lib/
 | |- gpu-effects/ # WebGPU effect pipeline + shader definitions
 | |- gpu-transitions/ # WebGPU transition pipeline + shaders
@@ -256,10 +266,11 @@ src/
 | |- export/ # WebCodecs export pipeline (Web Worker)
 | |- effects/ # GPU effect system and UI panels
 | |- keyframes/ # Keyframe animation, Bezier editor, easing
-| |- media-library/ # Media import, metadata, OPFS proxies, transcription
+| |- media-library/ # Media import, metadata, proxy cache, transcription, TTS
 | |- project-bundle/ # Project ZIP export/import
 | |- projects/ # Project management
-| \- settings/ # App settings, keyboard shortcut editor
+| |- settings/ # App settings, keyboard shortcut editor
+| \- workspace-gate/ # Workspace folder picker / permission gate
 |- shared/ # Shared UI/state/utilities across layers
 |- components/ui/ # shadcn/ui components
 |- config/hotkeys.ts # Keyboard shortcut definitions
diff --git a/src/app/state/editor/store.ts b/src/app/state/editor/store.ts
index 15949b3d9..7b1a35125 100644
--- a/src/app/state/editor/store.ts
+++ b/src/app/state/editor/store.ts
@@ -51,6 +51,7 @@ export const useEditorStore = create((set) => ({
   mediaSkimPreviewFrame: null,
   compoundClipSkimPreviewCompositionId: null,
   compoundClipSkimPreviewFrame: null,
+  transcriptionDialogDepth: 0,
   sourcePatchVideoEnabled: true,
   sourcePatchAudioEnabled: true,
   sourcePatchVideoTrackId: null,
@@ -179,6 +180,12 @@ export const useEditorStore = create((set) => ({
       compoundClipSkimPreviewFrame: null,
     };
   }),
+  beginTranscriptionDialog: () => set((state) => ({
+    transcriptionDialogDepth: state.transcriptionDialogDepth + 1,
+  })),
+  endTranscriptionDialog: () => set((state) => ({
+    transcriptionDialogDepth: Math.max(0, state.transcriptionDialogDepth - 1),
+  })),
   setSourcePatchVideoEnabled: (enabled) => set({ sourcePatchVideoEnabled: enabled }),
   setSourcePatchAudioEnabled: (enabled) => set({ sourcePatchAudioEnabled: enabled }),
   setSourcePatchVideoTrackId: (trackId) => set({ sourcePatchVideoTrackId: trackId }),
diff --git a/src/app/state/editor/types.ts b/src/app/state/editor/types.ts
index b3d42f298..3705494c1 100644
--- a/src/app/state/editor/types.ts
+++ b/src/app/state/editor/types.ts
@@ -16,6 +16,7 @@ export interface EditorState {
   mediaSkimPreviewFrame: number | null;
   compoundClipSkimPreviewCompositionId: string | null;
   compoundClipSkimPreviewFrame: number | null;
+  transcriptionDialogDepth: number;
   sourcePatchVideoEnabled: boolean;
   sourcePatchAudioEnabled: boolean;
   sourcePatchVideoTrackId: string | null;
@@ -54,6 +55,8 @@ export interface EditorActions {
   clearMediaSkimPreview: () => void;
   setCompoundClipSkimPreview: (compositionId: string | null, frame?: number | null) => void;
   clearCompoundClipSkimPreview: () => void;
+  beginTranscriptionDialog: () => void;
+  endTranscriptionDialog: () => void;
   setSourcePatchVideoEnabled: (enabled: boolean) => void;
   setSourcePatchAudioEnabled: (enabled: boolean) => void;
   setSourcePatchVideoTrackId: (trackId: string | null) => void;
diff --git a/src/config/hotkeys.ts b/src/config/hotkeys.ts
index aadf3f79a..5abfec2e8 100644
--- a/src/config/hotkeys.ts
+++ b/src/config/hotkeys.ts
@@ -67,6 +67,7 @@ export const HOTKEYS = {
   // UI
   TOGGLE_SNAP: 's',
+  OPEN_SCENE_BROWSER: 'mod+shift+f',
 
   // Markers
   ADD_MARKER: 'm',
@@ -325,6 +326,7 @@ export const HOTKEY_DESCRIPTIONS: Record = {
   // UI
   TOGGLE_SNAP: 'Toggle snap',
+  OPEN_SCENE_BROWSER: 'Open Scene Browser (search AI captions)',
 
   // Markers
   ADD_MARKER: 'Add marker at playhead',
diff --git a/src/features/composition-runtime/utils/audio-decode-cache.ts b/src/features/composition-runtime/utils/audio-decode-cache.ts
index 5b423082f..fc9a9c281 100644
--- a/src/features/composition-runtime/utils/audio-decode-cache.ts
+++ b/src/features/composition-runtime/utils/audio-decode-cache.ts
@@ -4,11 +4,11 @@
  * Caches decoded AudioBuffers for custom-decoded audio tracks so that
  * split clips from the same source share a single decode.
  *
- * Storage: Decoded audio is persisted to IndexedDB in 10-second bins
+ * Storage: Decoded audio is persisted to workspace-backed files in 10-second bins
  * (Int16 @ 22050 Hz stereo ~ 0.84 MB/bin). This avoids large single
  * records and allows progressive persistence during decode.
  *
- * On refresh, bins are loaded from IndexedDB in parallel and
+ * On refresh, bins are loaded from the workspace cache in parallel and
  * reassembled into an AudioBuffer with no re-decode needed.
  *
  * Surround (5.1/7.1) sources are downmixed to stereo during decode
@@ -72,10 +72,10 @@ const PLAYABLE_PARTIAL_PREROLL_SECONDS = 0.25;
 const STARTUP_PLAYABLE_PARTIAL_READY_SECONDS = 1;
 const PENDING_PLAYBACK_SLICE_REUSE_HEADROOM_SECONDS = 1;
 
-/** Sample rate for IndexedDB storage; 22050 Hz is sufficient for preview. */
+/** Sample rate for persisted preview-audio bins; 22050 Hz is sufficient for preview. */
 const STORAGE_SAMPLE_RATE = 22050;
 
-/** Bin duration in seconds for chunked IndexedDB storage. */
+/** Bin duration in seconds for chunked persisted storage. */
 const BIN_DURATION_SEC = 10;
 
 export interface PlaybackAudioSlice {
@@ -224,7 +224,7 @@ function createInputSource(
 /**
  * Get a cached AudioBuffer or decode one via mediabunny.
- * Checks: memory cache -> IndexedDB bins -> decode (persists bins progressively).
+ * Checks: memory cache -> persisted bins -> decode (persists bins progressively).
  * Concurrent calls for the same mediaId share a single promise.
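+ *
+ * @example
+ * // Illustrative sketch (hypothetical usage): split clips that reference the
+ * // same source media share one in-flight decode.
+ * const [left, right] = await Promise.all([
+ *   ensureDecodeStarted(mediaId, source),
+ *   ensureDecodeStarted(mediaId, source),
+ * ]);
+ * // left === right: one decode, one set of persisted bins, shared by both clips.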
*/ function ensureDecodeStarted(mediaId: string, src: PreviewAudioSource): Promise { @@ -619,11 +619,11 @@ export function clearPreviewAudioCache(): void { } // --------------------------------------------------------------------------- -// Load from IndexedDB bins +// Load from persisted bins // --------------------------------------------------------------------------- async function loadOrDecodeAudio(mediaId: string, src: PreviewAudioSource): Promise { - // Try IndexedDB + // Try persisted workspace cache try { const cached = await getDecodedPreviewAudio(mediaId); if (cached && 'kind' in cached && cached.kind === 'meta') { @@ -638,7 +638,7 @@ async function loadOrDecodeAudio(mediaId: string, src: PreviewAudioSource): Prom await deleteDecodedPreviewAudio(mediaId).catch(() => undefined); } } catch (err) { - log.warn('Failed to load from IndexedDB, will decode', { mediaId, err }); + log.warn('Failed to load persisted decoded audio, will decode', { mediaId, err }); } // Full decode with progressive bin persistence @@ -694,7 +694,7 @@ async function loadFromBins(meta: DecodedPreviewAudioMeta): Promise throw new Error(`Decoded audio bins incomplete: ${offset}/${totalFrames} frames`); } - log.info('Loaded decoded audio from IndexedDB', { + log.info('Loaded decoded audio from workspace cache', { mediaId, binCount, sampleRate, @@ -804,7 +804,7 @@ async function buildPreviewStereoBuffer( } /** - * Downsample, convert to Int16, and persist one bin to IndexedDB. + * Downsample, convert to Int16, and persist one bin to workspace-backed storage. * Returns persisted Int16 data so playback can be assembled without * retaining a massive full-resolution decode in memory. */ @@ -1015,9 +1015,9 @@ async function decodeFullAudio( binDurationSec: BIN_DURATION_SEC, createdAt: Date.now(), }).then(() => { - log.info('All bins persisted to IndexedDB', { mediaId, binCount: totalBins }); + log.info('All bins persisted to workspace cache', { mediaId, binCount: totalBins }); }).catch((err) => { - log.warn('Failed to persist bins to IndexedDB', { mediaId, err }); + log.warn('Failed to persist bins to workspace cache', { mediaId, err }); }); return combined; diff --git a/src/features/composition-runtime/utils/preview-audio-conform.ts b/src/features/composition-runtime/utils/preview-audio-conform.ts index 62b5df8f1..4d5af80eb 100644 --- a/src/features/composition-runtime/utils/preview-audio-conform.ts +++ b/src/features/composition-runtime/utils/preview-audio-conform.ts @@ -4,10 +4,12 @@ import { opfsService } from '@/features/composition-runtime/deps/media-library'; import { createLogger } from '@/shared/logging/logger'; import type { MediaMetadata } from '@/types/storage'; import { - mirrorBytesToWorkspace, readWorkspaceBlob, removeWorkspaceCacheEntry, } from '@/infrastructure/storage/workspace-fs/cache-mirror'; +import { previewAudioPath } from '@/infrastructure/storage/workspace-fs/paths'; +import { requireWorkspaceRoot } from '@/infrastructure/storage/workspace-fs/root'; +import { writeBlob } from '@/infrastructure/storage/workspace-fs/fs-primitives'; import { audioBufferToWavBlob } from './audio-buffer-wav'; const log = createLogger('PreviewAudioConform'); @@ -18,7 +20,7 @@ const PREVIEW_AUDIO_CONFORM_MIME_TYPE = 'audio/wav'; const pendingPreviewAudioConformLoads = new Map>(); const pendingPreviewAudioConformPersists = new Map>(); -function buildPreviewAudioConformOpfsPath(mediaId: string): string { +function buildPreviewAudioConformPath(mediaId: string): string { const shard1 = mediaId.slice(0, 2) || '00'; const 
shard2 = mediaId.slice(2, 4) || '00'; return `${PREVIEW_AUDIO_CONFORM_DIR}/${shard1}/${shard2}/${mediaId}.wav`; @@ -50,33 +52,38 @@ export async function resolvePreviewAudioConformUrl(mediaId: string): Promise undefined); - throw err; - } - - // Mirror to the workspace folder so other origins can reuse the - // conformed WAV without re-running the decode/encode. Fire-and-forget. - void mirrorBytesToWorkspace(opfsPath.split('/'), bytes); + await writeBlob( + requireWorkspaceRoot(), + previewAudioPath(persistedPath), + new Uint8Array(bytes), + ); + + await updateMedia(mediaId, { + previewAudioOpfsPath: persistedPath, + previewAudioMimeType: PREVIEW_AUDIO_CONFORM_MIME_TYPE, + previewAudioConformedAt: Date.now(), + }); })() .catch((err) => { log.warn('Failed to persist preview audio conform asset', { mediaId, err }); @@ -169,16 +161,17 @@ export async function deletePreviewAudioConform( } if (media.previewAudioOpfsPath) { + const persistedPath = media.previewAudioOpfsPath; try { - await opfsService.deleteFile(media.previewAudioOpfsPath); + await opfsService.deleteFile(persistedPath); } catch (err) { - log.warn('Failed to delete preview audio conform asset', { + log.debug('Legacy OPFS preview audio conform asset was already absent or unreadable', { mediaId, - path: media.previewAudioOpfsPath, + path: persistedPath, err, }); } - void removeWorkspaceCacheEntry(media.previewAudioOpfsPath.split('/')); + void removeWorkspaceCacheEntry(previewAudioPath(persistedPath)); } if (options?.clearMetadata) { diff --git a/src/features/editor/components/audio-meter-panel.tsx b/src/features/editor/components/audio-meter-panel.tsx index 35f14375a..175e72d6f 100644 --- a/src/features/editor/components/audio-meter-panel.tsx +++ b/src/features/editor/components/audio-meter-panel.tsx @@ -8,7 +8,7 @@ import { importWaveformCache, } from '@/features/editor/deps/timeline-store'; import { importMediaLibraryService } from '@/features/editor/deps/media-library'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { useEditorStore } from '@/app/state/editor/store'; import { EDITOR_LAYOUT_CSS_VALUES } from '@/app/editor-layout'; @@ -174,7 +174,14 @@ export const AudioMeterPanel = memo(function AudioMeterPanel() { lastTimestamp: 0, }); - const effectiveFrame = previewFrame ?? displayedFrame ?? 
currentFrame; + const effectiveFrame = useMemo(() => getResolvedPlaybackFrame({ + currentFrame, + currentFrameEpoch: usePlaybackStore.getState().currentFrameEpoch, + previewFrame, + previewFrameEpoch: usePlaybackStore.getState().previewFrameEpoch, + isPlaying, + displayedFrame, + }), [currentFrame, displayedFrame, isPlaying, previewFrame]); const combinedTracks = useMemo(() => { return tracks .filter((track) => !track.isGroup) diff --git a/src/features/editor/components/editor.test.tsx b/src/features/editor/components/editor.test.tsx index 0e33600fc..717313fcf 100644 --- a/src/features/editor/components/editor.test.tsx +++ b/src/features/editor/components/editor.test.tsx @@ -24,6 +24,7 @@ const mocks = vi.hoisted(() => ({ }), initTransitionChainSubscription: vi.fn(() => vi.fn()), createProjectUpgradeBackup: vi.fn(), + resizablePanelGroup: vi.fn(), })); vi.mock('@tanstack/react-router', () => ({ @@ -42,7 +43,18 @@ vi.mock('@/shared/logging/logger', () => ({ })); vi.mock('@/components/ui/resizable', () => ({ - ResizablePanelGroup: ({ children }: { children: ReactNode }) =>
{children}
, + ResizablePanelGroup: ({ + children, + ...props + }: { + children: ReactNode; + autoSaveId?: string; + className?: string; + direction?: string; + }) => { + mocks.resizablePanelGroup(props); + return
{children}
; + }, ResizablePanel: ({ children }: { children: ReactNode }) =>
{children}
, ResizableHandle: () =>
, })); @@ -326,4 +338,31 @@ describe('LoadedEditor migration metadata refresh', () => { await waitFor(() => expect(mocks.invalidate).not.toHaveBeenCalled()); }); + + it('persists the timeline split layout in localStorage', async () => { + render( + + ); + + expect(mocks.resizablePanelGroup).toHaveBeenCalledWith( + expect.objectContaining({ + autoSaveId: 'editor:timeline-layout', + direction: 'vertical', + }) + ); + }); }); diff --git a/src/features/editor/components/editor.tsx b/src/features/editor/components/editor.tsx index 4a5c28942..e4aeab202 100644 --- a/src/features/editor/components/editor.tsx +++ b/src/features/editor/components/editor.tsx @@ -461,7 +461,11 @@ export const LoadedEditor = memo(function LoadedEditor({ )} {/* Right side: Preview/Properties + Timeline */} - + {/* Top - Preview + Properties (inline mode) */} { ); expect(screen.getByTestId('inline-source-preview')).toBeInTheDocument(); - expect(screen.queryByTestId('video-preview')).not.toBeInTheDocument(); + expect(screen.getByTestId('video-preview')).toBeInTheDocument(); expect(screen.getByTestId('playback-controls')).toBeInTheDocument(); }); @@ -176,7 +176,7 @@ describe('PreviewArea mask editor toolbar', () => { ); expect(screen.getByTestId('inline-composition-preview')).toBeInTheDocument(); - expect(screen.queryByTestId('video-preview')).not.toBeInTheDocument(); + expect(screen.getByTestId('video-preview')).toBeInTheDocument(); expect(screen.getByTestId('playback-controls')).toBeInTheDocument(); }); diff --git a/src/features/editor/components/preview-area.tsx b/src/features/editor/components/preview-area.tsx index 687a22b81..5b990d228 100644 --- a/src/features/editor/components/preview-area.tsx +++ b/src/features/editor/components/preview-area.tsx @@ -92,28 +92,34 @@ const ProgramPreviewSurface = memo(function ProgramPreviewSurface({ const mediaSkimPreviewFrame = useEditorStore((s) => s.mediaSkimPreviewFrame); const compoundClipSkimPreviewCompositionId = useEditorStore((s) => s.compoundClipSkimPreviewCompositionId); const compoundClipSkimPreviewFrame = useEditorStore((s) => s.compoundClipSkimPreviewFrame); + const skimPreviewOverlay = compoundClipSkimPreviewCompositionId ? ( + + ) : mediaSkimPreviewMediaId ? ( + + ) : null; return ( - {compoundClipSkimPreviewCompositionId ? ( - - ) : mediaSkimPreviewMediaId ? ( - - ) : ( +
- )} + {skimPreviewOverlay && ( +
+ {skimPreviewOverlay} +
+ )} +
); }); diff --git a/src/features/editor/components/settings-dialog.tsx b/src/features/editor/components/settings-dialog.tsx index 2e2dc4205..82edbef64 100644 --- a/src/features/editor/components/settings-dialog.tsx +++ b/src/features/editor/components/settings-dialog.tsx @@ -1,5 +1,6 @@ import { useState, useCallback } from 'react'; import type { MediaMetadata } from '@/types/storage'; +import { toast } from 'sonner'; import { Dialog, DialogContent, @@ -17,7 +18,6 @@ import { AlertDialogTitle, } from '@/components/ui/alert-dialog'; import { Button } from '@/components/ui/button'; -import { Combobox } from '@/components/ui/combobox'; import { Label } from '@/components/ui/label'; import { Separator } from '@/components/ui/separator'; import { @@ -30,19 +30,23 @@ import { import { Switch } from '@/components/ui/switch'; import { Slider } from '@/components/ui/slider'; import { ScrollArea } from '@/components/ui/scroll-area'; +import { Input } from '@/components/ui/input'; import { - RotateCcw, Trash2, Loader2, Check, ImagePlus, Film, - Settings2, Rows3, AudioLines, HardDrive, + RotateCcw, Trash2, Loader2, Check, ImagePlus, Film, TriangleAlert, + Settings2, Rows3, HardDrive, Sparkles, } from 'lucide-react'; import { LocalInferenceUnloadControl, LocalModelCacheControl, useSettingsStore, + CAPTIONING_INTERVAL_BOUNDS, + DEFAULT_CAPTIONING_INTERVAL_SECONDS, + resolveCaptioningIntervalSec, + type CaptioningIntervalUnit, } from '@/features/editor/deps/settings'; import { useMediaLibraryStore, getSharedProxyKey, - getMediaTranscriptionModelOptions, importProxyService, importMediaLibraryService, importThumbnailGenerator, @@ -56,25 +60,28 @@ import { clearPreviewAudioCache } from '@/features/editor/deps/composition-runti import { createLogger } from '@/shared/logging/logger'; import { cn } from '@/shared/ui/cn'; import { EDITOR_DENSITY_OPTIONS } from '@/app/editor-layout'; -import { - getWhisperQuantizationOption, - getWhisperLanguageSelectValue, - getWhisperLanguageSettingValue, - WHISPER_LANGUAGE_OPTIONS, - WHISPER_QUANTIZATION_OPTIONS, -} from '@/shared/utils/whisper-settings'; -import type { MediaTranscriptModel, MediaTranscriptQuantization } from '@/types/storage'; const log = createLogger('SettingsDialog'); -const TRANSCRIPTION_MODEL_OPTIONS = getMediaTranscriptionModelOptions(); const SETTINGS_SECTIONS = [ { id: 'general', label: 'General', icon: Settings2 }, { id: 'timeline', label: 'Timeline', icon: Rows3 }, - { id: 'whisper', label: 'Whisper', icon: AudioLines }, + { id: 'ai', label: 'AI', icon: Sparkles }, { id: 'storage', label: 'Storage', icon: HardDrive }, ] as const; +const ESTIMATE_REFERENCE_DURATION_SEC = 60; +const ESTIMATE_REFERENCE_FPS = 30; + +function formatCaptionEstimate(unit: CaptioningIntervalUnit, value: number): string { + const intervalSec = resolveCaptioningIntervalSec(unit, value, ESTIMATE_REFERENCE_FPS); + if (intervalSec <= 0) { + return 'Enter an interval above zero.'; + } + const sceneCount = Math.max(1, Math.round(ESTIMATE_REFERENCE_DURATION_SEC / intervalSec)); + return `~${sceneCount} ${sceneCount === 1 ? 
'scene' : 'scenes'} per 1-min clip at ${ESTIMATE_REFERENCE_FPS}fps`; +} + type SettingsSectionId = (typeof SETTINGS_SECTIONS)[number]['id']; interface SettingsDialogProps { @@ -82,6 +89,99 @@ interface SettingsDialogProps { onOpenChange: (open: boolean) => void; } +interface BatchActionResult { + total: number; + succeeded: number; + failed: number; + failedItems: string[]; +} + +interface ActionFeedback { + tone: 'success' | 'error'; + message: string; +} + +function formatCount(count: number, noun: string): string { + return `${count} ${noun}${count === 1 ? '' : 's'}`; +} + +function formatFailedItems(items: string[]): string { + if (items.length === 0) return ''; + if (items.length <= 2) return items.join(', '); + return `${items.slice(0, 2).join(', ')}, +${items.length - 2} more`; +} + +function createBatchResult(total: number, failedItems: string[]): BatchActionResult { + return { + total, + succeeded: Math.max(0, total - failedItems.length), + failed: failedItems.length, + failedItems, + }; +} + +function getBatchOutcomeFeedback( + actionLabel: string, + result: BatchActionResult, +): ActionFeedback { + if (result.total === 0) { + return { + tone: 'success', + message: `No project media to ${actionLabel.toLowerCase()}.`, + }; + } + + if (result.failed === 0) { + return { + tone: 'success', + message: `${actionLabel} completed for ${formatCount(result.succeeded, 'item')}.`, + }; + } + + const failedLabel = formatFailedItems(result.failedItems); + + if (result.succeeded === 0) { + return { + tone: 'error', + message: `Couldn't ${actionLabel.toLowerCase()} ${formatCount(result.failed, 'item')}${failedLabel ? `: ${failedLabel}` : '.'}`, + }; + } + + return { + tone: 'error', + message: `${actionLabel} completed for ${result.succeeded}/${result.total} items. Needs attention: ${failedLabel}.`, + }; +} + +function showBatchOutcomeToast( + successTitle: string, + partialTitle: string, + failureTitle: string, + result: BatchActionResult, +): void { + if (result.total === 0) { + toast.success(successTitle, { + description: 'No project media needed updating.', + }); + return; + } + + if (result.failed === 0) { + toast.success(successTitle, { + description: `${formatCount(result.succeeded, 'item')} updated.`, + }); + return; + } + + const description = result.succeeded === 0 + ? formatFailedItems(result.failedItems) + : `${formatCount(result.succeeded, 'item')} updated. Failed: ${formatFailedItems(result.failedItems)}`; + + toast.error(result.succeeded === 0 ? failureTitle : partialTitle, { + description, + }); +} + /** * Clear regenerable cache data for the current project's media only. * Clears filmstrips, waveforms, GIF frames, and decoded audio @@ -89,8 +189,10 @@ interface SettingsDialogProps { * * Does NOT clear thumbnails (not auto-regenerated) or proxies (separate action). 
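 *
 * @example
 * // Illustrative sketch mirroring the handleClearCache call site below:
 * const result = await clearProjectCaches(
 *   mediaItems.map((m) => ({ id: m.id, fileName: m.fileName })),
 * );
 * // result is a BatchActionResult ({ total, succeeded, failed, failedItems })
 * // that feeds getBatchOutcomeFeedback() and showBatchOutcomeToast().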
*/ -async function clearProjectCaches(mediaIds: string[]): Promise { - if (mediaIds.length === 0) return; +async function clearProjectCaches( + mediaItems: Array>, +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const [ { deleteWaveform, deleteGifFrames, deleteDecodedPreviewAudio }, @@ -109,49 +211,73 @@ async function clearProjectCaches(mediaIds: string[]): Promise { // Clear in-memory preview audio cache (not keyed per-media, so clear all) clearPreviewAudioCache(); - await Promise.all( - mediaIds.flatMap((id) => [ - deleteWaveform(id).catch((e) => { log.debug('Failed to delete waveform:', id, e); }), - deleteGifFrames(id).catch((e) => { log.debug('Failed to delete GIF frames:', id, e); }), - deleteDecodedPreviewAudio(id).catch((e) => { log.debug('Failed to delete decoded audio:', id, e); }), - deletePreviewAudioConform(id, { clearMetadata: true }).catch((e) => { log.debug('Failed to delete preview conform audio:', id, e); }), - gifFrameCache.clearMedia(id).catch((e) => { log.debug('Failed to clear GIF cache:', id, e); }), - filmstripCache.clearMedia(id).catch((e) => { log.debug('Failed to clear filmstrip cache:', id, e); }), - waveformCache.clearMedia(id).catch((e) => { log.debug('Failed to clear waveform cache:', id, e); }), - ]) - ); + const failedItems: string[] = []; + + await Promise.all(mediaItems.map(async ({ id, fileName }) => { + const results = await Promise.allSettled([ + deleteWaveform(id), + deleteGifFrames(id), + deleteDecodedPreviewAudio(id), + deletePreviewAudioConform(id, { clearMetadata: true }), + gifFrameCache.clearMedia(id), + filmstripCache.clearMedia(id), + waveformCache.clearMedia(id), + ]); + + const failures = results.filter((result) => result.status === 'rejected'); + if (failures.length > 0) { + log.warn('Failed to fully clear project cache for media item', { + mediaId: id, + fileName, + failures: failures.map((result) => String(result.reason)), + }); + failedItems.push(fileName); + } + })); - log.info(`Cleared caches for ${mediaIds.length} media items`); + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Cleared caches for ${result.succeeded}/${result.total} media items`); + return result; } /** Delete all proxy videos for the given media items and clear their store status. */ async function clearProjectProxies( mediaItems: MediaMetadata[] -): Promise { - if (mediaItems.length === 0) return; +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const { proxyService } = await importProxyService(); + const failedItems: string[] = []; await Promise.all(mediaItems.map(async (media) => { try { await proxyService.deleteProxy(media.id, getSharedProxyKey(media)); - } catch { /* already absent */ } - useMediaLibraryStore.getState().clearProxyStatus(media.id); - proxyService.clearProxyKey(media.id); + useMediaLibraryStore.getState().clearProxyStatus(media.id); + proxyService.clearProxyKey(media.id); + } catch (error) { + log.warn('Failed to clear proxy for media item', { + mediaId: media.id, + fileName: media.fileName, + error, + }); + failedItems.push(media.fileName); + } })); - log.info(`Cleared proxies for ${mediaItems.length} media items`); + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Cleared proxies for ${result.succeeded}/${result.total} media items`); + return result; } /** * Regenerate thumbnails for all media in the current project. - * Fetches each media file, generates a new thumbnail, and saves it to IndexedDB. 
+ * Fetches each media file, generates a new thumbnail, and saves it to workspace storage. */ async function regenerateProjectThumbnails( mediaItems: Array<{ id: string; fileName: string; mimeType: string }>, onProgress?: (done: number, total: number) => void, -): Promise { - if (mediaItems.length === 0) return 0; +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const [ { mediaLibraryService }, @@ -163,7 +289,8 @@ async function regenerateProjectThumbnails( import('@/infrastructure/storage'), ]); - let regenerated = 0; + let succeeded = 0; + const failedItems: string[] = []; for (const media of mediaItems) { try { @@ -189,18 +316,20 @@ async function regenerateProjectThumbnails( // Clear the in-memory blob URL cache so UI picks up the new thumbnail mediaLibraryService.clearThumbnailCache(media.id); - regenerated++; + succeeded++; } catch (err) { log.warn(`Failed to regenerate thumbnail for ${media.fileName}:`, err); + failedItems.push(media.fileName); } - onProgress?.(regenerated, mediaItems.length); + onProgress?.(succeeded + failedItems.length, mediaItems.length); } // Reload store so MediaCards see the updated thumbnailId and re-fetch await useMediaLibraryStore.getState().loadMediaItems(); - log.info(`Regenerated ${regenerated}/${mediaItems.length} thumbnails`); - return regenerated; + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Regenerated ${result.succeeded}/${result.total} thumbnails`); + return result; } export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { @@ -210,32 +339,46 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { const showFilmstrips = useSettingsStore((s) => s.showFilmstrips); const autoSaveInterval = useSettingsStore((s) => s.autoSaveInterval); const maxUndoHistory = useSettingsStore((s) => s.maxUndoHistory); - const defaultWhisperModel = useSettingsStore((s) => s.defaultWhisperModel); - const defaultWhisperQuantization = useSettingsStore((s) => s.defaultWhisperQuantization); - const defaultWhisperLanguage = useSettingsStore((s) => s.defaultWhisperLanguage); + const captioningIntervalUnit = useSettingsStore((s) => s.captioningIntervalUnit); + const captioningIntervalValue = useSettingsStore((s) => s.captioningIntervalValue); const setSetting = useSettingsStore((s) => s.setSetting); const resetToDefaults = useSettingsStore((s) => s.resetToDefaults); + const intervalBounds = CAPTIONING_INTERVAL_BOUNDS[captioningIntervalUnit]; + const intervalInputStep = captioningIntervalUnit === 'seconds' ? 0.5 : 1; + const intervalUnitLabel = captioningIntervalUnit === 'seconds' ? 
'sec' : 'frames'; + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); const proxyStatus = useMediaLibraryStore((s) => s.proxyStatus); const [activeSection, setActiveSection] = useState('general'); - const [clearState, setClearState] = useState<'idle' | 'clearing' | 'done'>('idle'); + const [clearState, setClearState] = useState<'idle' | 'clearing' | 'done' | 'partial'>('idle'); const [showClearConfirm, setShowClearConfirm] = useState(false); - const [regenState, setRegenState] = useState<'idle' | 'working' | 'done'>('idle'); + const [regenState, setRegenState] = useState<'idle' | 'working' | 'done' | 'partial'>('idle'); const [regenProgress, setRegenProgress] = useState(''); - const [proxyState, setProxyState] = useState<'idle' | 'clearing' | 'done'>('idle'); + const [proxyState, setProxyState] = useState<'idle' | 'clearing' | 'done' | 'partial'>('idle'); const [proxyGenerateState, setProxyGenerateState] = useState<'idle' | 'queueing' | 'done'>('idle'); + const [clearFeedback, setClearFeedback] = useState(null); + const [regenFeedback, setRegenFeedback] = useState(null); + const [proxyFeedback, setProxyFeedback] = useState(null); const handleClearCache = useCallback(async () => { setClearState('clearing'); try { - const ids = mediaItems.map((m) => m.id); - await clearProjectCaches(ids); - setClearState('done'); + const items = mediaItems.map((m) => ({ id: m.id, fileName: m.fileName })); + const result = await clearProjectCaches(items); + const feedback = getBatchOutcomeFeedback('Clear Cache', result); + setClearFeedback(feedback); + setClearState(result.failed === 0 ? 'done' : 'partial'); + showBatchOutcomeToast('Project cache cleared', 'Project cache partially cleared', 'Project cache not cleared', result); setTimeout(() => setClearState('idle'), 2000); } catch (err) { log.error('Failed to clear caches', err); + setClearFeedback({ + tone: 'error', + message: 'Couldn\'t clear project cache.', + }); + toast.error('Failed to clear project cache'); setClearState('idle'); } }, [mediaItems]); @@ -245,16 +388,24 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { setRegenProgress('0/' + mediaItems.length); try { const items = mediaItems.map((m) => ({ id: m.id, fileName: m.fileName, mimeType: m.mimeType })); - await regenerateProjectThumbnails(items, (done, total) => { + const result = await regenerateProjectThumbnails(items, (done, total) => { setRegenProgress(`${done}/${total}`); }); - setRegenState('done'); + const feedback = getBatchOutcomeFeedback('Regenerate Thumbnails', result); + setRegenFeedback(feedback); + setRegenState(result.failed === 0 ? 'done' : 'partial'); + showBatchOutcomeToast('Thumbnails regenerated', 'Thumbnails partially regenerated', 'Thumbnails not regenerated', result); setTimeout(() => { setRegenState('idle'); setRegenProgress(''); }, 2000); } catch (err) { log.error('Failed to regenerate thumbnails', err); + setRegenFeedback({ + tone: 'error', + message: 'Couldn\'t regenerate thumbnails.', + }); + toast.error('Failed to regenerate thumbnails'); setRegenState('idle'); setRegenProgress(''); } @@ -263,11 +414,19 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { const handleClearProxies = useCallback(async () => { setProxyState('clearing'); try { - await clearProjectProxies(mediaItems); - setProxyState('done'); + const result = await clearProjectProxies(mediaItems); + const feedback = getBatchOutcomeFeedback('Delete Proxies', result); + setProxyFeedback(feedback); + setProxyState(result.failed === 0 ? 
'done' : 'partial'); + showBatchOutcomeToast('Proxies deleted', 'Proxies partially deleted', 'Proxies not deleted', result); setTimeout(() => setProxyState('idle'), 2000); } catch (err) { log.error('Failed to clear proxies', err); + setProxyFeedback({ + tone: 'error', + message: 'Couldn\'t delete proxies.', + }); + toast.error('Failed to delete proxies'); setProxyState('idle'); } }, [mediaItems]); @@ -318,8 +477,6 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { } }, [mediaItems]); - const defaultWhisperLanguageValue = getWhisperLanguageSelectValue(defaultWhisperLanguage); - const defaultWhisperQuantizationOption = getWhisperQuantizationOption(defaultWhisperQuantization); const missingProjectProxyCount = mediaItems.filter((media) => ( media.mimeType.startsWith('video/') && proxyStatus.get(media.id) !== 'ready' @@ -424,6 +581,75 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {
)} + {activeSection === 'ai' && ( +
+
+
+
+ +

+ How often "Analyze with AI" samples a frame for captioning. +

+
+
+
+
+ {(['seconds', 'frames'] as const).map((unit) => ( + + ))} +
+ { + const parsed = Number(event.target.value); + if (Number.isFinite(parsed)) { + setSetting('captioningIntervalValue', parsed); + } + }} + /> + {intervalUnitLabel} + +
+

+ {formatCaptionEstimate(captioningIntervalUnit, captioningIntervalValue)}. + Smaller intervals produce denser scenes but take longer to generate. +

+
+
+ )} + {activeSection === 'timeline' && (
@@ -444,77 +670,6 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {
)} - {activeSection === 'whisper' && ( -
-
- - -

- Used when transcription starts without an explicit model override. -

-
- -
- - -

- Pick based on memory first. {defaultWhisperQuantizationOption.description} -

-
- -
- - - setSetting('defaultWhisperLanguage', getWhisperLanguageSettingValue(value)) - } - options={WHISPER_LANGUAGE_OPTIONS} - placeholder="Auto-detect" - searchPlaceholder="Search languages..." - emptyMessage="No languages match that search." - /> -

- Choose Auto-detect to infer the language, or lock transcription to a known language for faster startup. -

-
- - -
- )} - {activeSection === 'storage' && (
@@ -550,6 +705,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Waveforms, filmstrips, GIF frames, decoded audio

+ {clearFeedback && ( +

+ {clearFeedback.message} +

+ )}
@@ -570,6 +741,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Re-create media library thumbnails for this project

+ {regenFeedback && ( +

+ {regenFeedback.message} +

+ )}
@@ -590,6 +777,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Remove generated proxy videos for this project

+ {proxyFeedback && ( +

+ {proxyFeedback.message} +

+ )}
- +
+
+ +

+ Unload resident runtimes or clear cached model downloads. +

+
+ + +
)}
diff --git a/src/features/editor/deps/scene-browser-contract.ts b/src/features/editor/deps/scene-browser-contract.ts
new file mode 100644
index 000000000..3f3eb127d
--- /dev/null
+++ b/src/features/editor/deps/scene-browser-contract.ts
@@ -0,0 +1,5 @@
+/**
+ * Adapter — editor shell wires the Scene Browser hotkey through this barrel.
+ */
+
+export { useSceneBrowserStore } from '@/features/scene-browser';
diff --git a/src/features/editor/deps/scene-browser.ts b/src/features/editor/deps/scene-browser.ts
new file mode 100644
index 000000000..bf30c3858
--- /dev/null
+++ b/src/features/editor/deps/scene-browser.ts
@@ -0,0 +1 @@
+export * from './scene-browser-contract';
diff --git a/src/features/editor/deps/settings-contract.ts b/src/features/editor/deps/settings-contract.ts
index b9fd760db..ba95bb9d7 100644
--- a/src/features/editor/deps/settings-contract.ts
+++ b/src/features/editor/deps/settings-contract.ts
@@ -3,7 +3,13 @@
  * Editor modules should import settings stores/services from here.
  */
 
-export { useSettingsStore } from '@/features/settings/stores/settings-store';
+export {
+  useSettingsStore,
+  CAPTIONING_INTERVAL_BOUNDS,
+  DEFAULT_CAPTIONING_INTERVAL_SECONDS,
+  resolveCaptioningIntervalSec,
+} from '@/features/settings/stores/settings-store';
+export type { CaptioningIntervalUnit } from '@/features/settings/stores/settings-store';
 export { LocalInferenceUnloadControl } from '@/features/settings/components/local-inference-unload-control';
 export { LocalModelCacheControl } from '@/features/settings/components/local-model-cache-control';
 export { useResolvedHotkeys } from '@/features/settings/hooks/use-resolved-hotkeys';
diff --git a/src/features/editor/hooks/use-editor-hotkeys.ts b/src/features/editor/hooks/use-editor-hotkeys.ts
index 1f06d114d..2775f26fa 100644
--- a/src/features/editor/hooks/use-editor-hotkeys.ts
+++ b/src/features/editor/hooks/use-editor-hotkeys.ts
@@ -2,6 +2,8 @@
 import { useHotkeys } from 'react-hotkeys-hook';
 import { HOTKEY_OPTIONS } from '@/config/hotkeys';
 import { useResolvedHotkeys } from '@/features/editor/deps/settings';
+import { useSceneBrowserStore } from '@/features/editor/deps/scene-browser';
+
 interface EditorHotkeyCallbacks {
   onSave?: () => void;
   onExport?: () => void;
@@ -13,6 +15,7 @@
  * Handles editor-level shortcuts that work across all components:
  * - Save (Ctrl+S) - Saves timeline to project
  * - Export (Ctrl+Shift+E) - Exports video
+ * - Open Scene Browser (Ctrl+Shift+F) - Opens caption search across media
  *
  * Note: Undo/Redo are handled in useTimelineShortcuts since they're timeline-specific
  *
@@ -46,4 +49,17 @@ export function useEditorHotkeys(callbacks: EditorHotkeyCallbacks = {}) {
     { ...HOTKEY_OPTIONS, eventListenerOptions: { capture: true } },
     [callbacks.onExport]
   );
+
+  // Open Scene Browser: Cmd/Ctrl+Shift+F — capture phase because the
+  // default browser binding is a no-op here, but Chrome will still eat it
+  // if our listener is in the bubbling phase.
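+  // (Roughly equivalent to registering
+  //   window.addEventListener('keydown', handler, { capture: true })
+  // which assumes react-hotkeys-hook forwards eventListenerOptions to
+  // addEventListener; treat this as an illustrative sketch, not a guarantee.)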
+ useHotkeys( + hotkeys.OPEN_SCENE_BROWSER, + (event) => { + event.preventDefault(); + useSceneBrowserStore.getState().openBrowser({ focus: true }); + }, + { ...HOTKEY_OPTIONS, eventListenerOptions: { capture: true } }, + [] + ); } diff --git a/src/features/export/utils/canvas-effects.test.ts b/src/features/export/utils/canvas-effects.test.ts index 184251324..684940da0 100644 --- a/src/features/export/utils/canvas-effects.test.ts +++ b/src/features/export/utils/canvas-effects.test.ts @@ -1,8 +1,21 @@ -import { describe, expect, it } from 'vitest'; -import { getAdjustmentLayerEffects, type AdjustmentLayerWithTrackOrder } from './canvas-effects'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { AdjustmentItem } from '@/types/timeline'; import type { ItemEffect } from '@/types/effects'; +const mockFns = vi.hoisted(() => ({ + applyMasksMock: vi.fn(), +})); + +vi.mock('./canvas-masks', () => ({ + applyMasks: mockFns.applyMasksMock, +})); + +import { + getAdjustmentLayerEffects, + renderEffectsFromMaskedSource, + type AdjustmentLayerWithTrackOrder, +} from './canvas-effects'; + function createGpuEffect(id: string, amount: number): ItemEffect { return { id, @@ -33,6 +46,14 @@ function createAdjustmentLayer( return { layer, trackOrder }; } +function createMock2dContext(canvas: OffscreenCanvas): OffscreenCanvasRenderingContext2D { + return { + canvas, + drawImage: vi.fn(), + clearRect: vi.fn(), + } as unknown as OffscreenCanvasRenderingContext2D; +} + describe('getAdjustmentLayerEffects', () => { it('prefers preview overrides for active adjustment layers in preview mode', () => { const committedEffect = createGpuEffect('effect-1', 0.25); @@ -58,6 +79,27 @@ describe('getAdjustmentLayerEffects', () => { expect(effects).toEqual([committedEffect]); }); + it('uses the live adjustment layer snapshot when committed effects change in preview mode', () => { + const committedEffect = createGpuEffect('effect-1', 0.25); + const updatedEffect = createGpuEffect('effect-1', 0.8); + const adjustmentLayers = [createAdjustmentLayer('adj-1', 1, [committedEffect])]; + + const effects = getAdjustmentLayerEffects( + 3, + adjustmentLayers, + 10, + undefined, + (itemId) => itemId === 'adj-1' + ? 
{ + ...adjustmentLayers[0]!.layer, + effects: [updatedEffect], + } + : undefined, + ); + + expect(effects).toEqual([updatedEffect]); + }); + it('ignores inactive or out-of-scope adjustment layers before checking overrides', () => { const activeEffect = createGpuEffect('active', 0.4); const inactiveEffect = createGpuEffect('inactive', 0.7); @@ -89,3 +131,50 @@ describe('getAdjustmentLayerEffects', () => { expect(previewLookups).toEqual(['adj-active']); }); }); + +describe('renderEffectsFromMaskedSource', () => { + beforeEach(() => { + mockFns.applyMasksMock.mockReset(); + }); + + it('pre-masks the effect source before the effect chain draws from it', async () => { + const sourceCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const maskedSourceCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const effectCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const maskedSourceCtx = createMock2dContext(maskedSourceCanvas); + const effectCtx = createMock2dContext(effectCanvas); + const canvasPool = { + acquire: vi.fn() + .mockReturnValueOnce({ canvas: maskedSourceCanvas, ctx: maskedSourceCtx }) + .mockReturnValueOnce({ canvas: effectCanvas, ctx: effectCtx }), + }; + const masks = [{ + path: {} as Path2D, + inverted: false, + feather: 0, + maskType: 'clip' as const, + }]; + const effect = createGpuEffect('fx-1', 0.5); + + const result = await renderEffectsFromMaskedSource( + canvasPool, + sourceCanvas, + [effect], + masks, + 12, + { width: 1920, height: 1080, fps: 30 }, + ); + + expect(mockFns.applyMasksMock).toHaveBeenCalledWith( + maskedSourceCtx, + sourceCanvas, + masks, + { width: 1920, height: 1080, fps: 30 }, + ); + expect(effectCtx.drawImage).toHaveBeenCalledWith(maskedSourceCanvas, 0, 0); + expect(result).toEqual({ + source: effectCanvas, + poolCanvases: [maskedSourceCanvas, effectCanvas], + }); + }); +}); diff --git a/src/features/export/utils/canvas-effects.ts b/src/features/export/utils/canvas-effects.ts index a16e35917..63cdfe383 100644 --- a/src/features/export/utils/canvas-effects.ts +++ b/src/features/export/utils/canvas-effects.ts @@ -5,9 +5,11 @@ */ import type { ItemEffect, GpuEffect } from '@/types/effects'; -import type { AdjustmentItem } from '@/types/timeline'; +import type { AdjustmentItem, TimelineItem } from '@/types/timeline'; import { createLogger } from '@/shared/logging/logger'; import type { EffectsPipeline, GpuEffectInstance } from '@/infrastructure/gpu/effects'; +import { applyMasks, type MaskCanvasSettings } from './canvas-masks'; +import type { CanvasPool } from './canvas-pool'; const log = createLogger('CanvasEffects'); @@ -19,6 +21,51 @@ export interface AdjustmentLayerWithTrackOrder { trackOrder: number; } +/** + * Applies any track-scoped shape masks to the source canvas before running the + * effect stack. The caller can still apply a final post-effect mask pass during + * compositing so effect bleed is trimmed to the same shape. 
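+ *
+ * @example
+ * // Illustrative sketch of the caller contract (mirrors the composition
+ * // renderer): draw the returned source, then release the pool canvases.
+ * const { source, poolCanvases } = await renderEffectsFromMaskedSource(
+ *   canvasPool, itemCanvas, effects, masks, frame, canvasSettings, gpuPipeline,
+ * );
+ * targetCtx.drawImage(source, 0, 0);
+ * for (const pooled of poolCanvases) canvasPool.release(pooled);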
+ */ +export async function renderEffectsFromMaskedSource( + canvasPool: Pick, + sourceCanvas: OffscreenCanvas, + effects: ItemEffect[], + masks: EffectSourceMask[], + frame: number, + canvas: EffectCanvasSettings & MaskCanvasSettings, + gpuPipeline?: EffectsPipeline | null, +): Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> { + const poolCanvases: OffscreenCanvas[] = []; + let effectSource = sourceCanvas; + + if (masks.length > 0) { + const { canvas: maskedSourceCanvas, ctx: maskedSourceCtx } = canvasPool.acquire(); + applyMasks(maskedSourceCtx, sourceCanvas, masks, canvas); + effectSource = maskedSourceCanvas; + poolCanvases.push(maskedSourceCanvas); + } + + if (effects.length === 0) { + return { source: effectSource, poolCanvases }; + } + + const { canvas: effectCanvas, ctx: effectCtx } = canvasPool.acquire(); + const deferredGpuCanvas = await applyAllEffectsAsync( + effectCtx, + effectSource, + effects, + frame, + canvas, + gpuPipeline, + ); + poolCanvases.push(effectCanvas); + + return { + source: deferredGpuCanvas ?? effectCanvas, + poolCanvases, + }; +} + /** * Canvas settings for effect rendering */ @@ -27,6 +74,14 @@ interface EffectCanvasSettings { height: number; } +export interface EffectSourceMask { + path: Path2D; + inverted: boolean; + feather: number; + maskType: 'clip' | 'alpha'; + trackOrder?: number; +} + // ============================================================================ // GPU Effects // ============================================================================ @@ -110,10 +165,18 @@ export function getAdjustmentLayerEffects( adjustmentLayers: AdjustmentLayerWithTrackOrder[], frame: number, getPreviewEffectsOverride?: (itemId: string) => ItemEffect[] | undefined, + getLiveItemSnapshot?: (itemId: string) => TimelineItem | undefined, ): ItemEffect[] { if (adjustmentLayers.length === 0) return []; return adjustmentLayers + .map(({ layer, trackOrder }) => { + const liveLayer = getLiveItemSnapshot?.(layer.id); + return { + layer: liveLayer?.type === 'adjustment' ? 
liveLayer : layer, + trackOrder, + }; + }) .filter(({ layer, trackOrder }) => { // Item must be BEHIND the adjustment (higher track order = lower zIndex) if (itemTrackOrder <= trackOrder) return false; diff --git a/src/features/export/utils/canvas-item-renderer.ts b/src/features/export/utils/canvas-item-renderer.ts index 6c3db0368..5103993c3 100644 --- a/src/features/export/utils/canvas-item-renderer.ts +++ b/src/features/export/utils/canvas-item-renderer.ts @@ -26,9 +26,10 @@ import { doesMaskAffectTrack } from '@/shared/utils/mask-scope'; // Subsystem imports import { getAnimatedTransform } from './canvas-keyframes'; import { - applyAllEffectsAsync, + renderEffectsFromMaskedSource, getAdjustmentLayerEffects, combineEffects, + type EffectSourceMask, type AdjustmentLayerWithTrackOrder, } from './canvas-effects'; import { @@ -122,6 +123,7 @@ export interface ItemRenderContext { renderMode: 'export' | 'preview'; scrubbingCache?: ScrubbingCache | null; getCurrentItemSnapshot?: (item: TItem) => TItem; + getLiveItemSnapshotById?: (itemId: string) => TimelineItem | undefined; getCurrentKeyframes?: (itemId: string) => ItemKeyframes | undefined; getPreviewTransformOverride?: (itemId: string) => Partial | undefined; getPreviewCornerPinOverride?: (itemId: string) => TimelineItem['cornerPin'] | undefined; @@ -1536,6 +1538,7 @@ export async function renderTransitionToCanvas( frame: number, rctx: ItemRenderContext, trackOrder: number, + trackMasks: EffectSourceMask[] = [], ): Promise { const { canvasPool, canvasSettings } = rctx; const { leftClip, rightClip } = activeTransition; @@ -1568,41 +1571,49 @@ export async function renderTransitionToCanvas( const hasLeftEffects = leftCombinedEffects.length > 0; const hasRightEffects = rightCombinedEffects.length > 0; - // Track pool effect canvases separately — in GPU batch mode the final - // source may be a GPU output canvas (not from the pool), but the pool - // canvases still need to be released. - let leftEffectPoolCanvas: OffscreenCanvas | null = null; - let rightEffectPoolCanvas: OffscreenCanvas | null = null; + const leftEffectPoolCanvases: OffscreenCanvas[] = []; + const rightEffectPoolCanvases: OffscreenCanvas[] = []; if (hasLeftEffects || hasRightEffects) { - // In GPU batch mode, applyAllEffectsAsync returns a deferred GPU canvas - // instead of drawing back to the effect canvas. We must capture and use - // the returned canvas, otherwise effects are silently dropped. 
- let leftGpuPromise: Promise | undefined; - let rightGpuPromise: Promise | undefined; + let leftEffectsPromise: Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> | undefined; + let rightEffectsPromise: Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> | undefined; if (hasLeftEffects) { - const { canvas: leftEffectCanvas, ctx: leftEffectCtx } = canvasPool.acquire(); - leftEffectPoolCanvas = leftEffectCanvas; - leftFinalCanvas = leftEffectCanvas; - leftGpuPromise = applyAllEffectsAsync(leftEffectCtx, leftCanvas, leftCombinedEffects, frame, canvasSettings, rctx.gpuPipeline); + leftEffectsPromise = renderEffectsFromMaskedSource( + canvasPool, + leftCanvas, + leftCombinedEffects, + trackMasks, + frame, + canvasSettings, + rctx.gpuPipeline, + ); } if (hasRightEffects) { - const { canvas: rightEffectCanvas, ctx: rightEffectCtx } = canvasPool.acquire(); - rightEffectPoolCanvas = rightEffectCanvas; - rightFinalCanvas = rightEffectCanvas; - rightGpuPromise = applyAllEffectsAsync(rightEffectCtx, rightCanvas, rightCombinedEffects, frame, canvasSettings, rctx.gpuPipeline); + rightEffectsPromise = renderEffectsFromMaskedSource( + canvasPool, + rightCanvas, + rightCombinedEffects, + trackMasks, + frame, + canvasSettings, + rctx.gpuPipeline, + ); } - const [leftGpu, rightGpu] = await Promise.all([ - leftGpuPromise ?? Promise.resolve(null), - rightGpuPromise ?? Promise.resolve(null), + const [leftEffects, rightEffects] = await Promise.all([ + leftEffectsPromise ?? Promise.resolve(null), + rightEffectsPromise ?? Promise.resolve(null), ]); - // Use deferred GPU canvas when returned (batch mode), otherwise the - // effect canvas already has the result drawn into it. - if (leftGpu) leftFinalCanvas = leftGpu; - if (rightGpu) rightFinalCanvas = rightGpu; + if (leftEffects) { + leftFinalCanvas = leftEffects.source; + leftEffectPoolCanvases.push(...leftEffects.poolCanvases); + } + if (rightEffects) { + rightFinalCanvas = rightEffects.source; + rightEffectPoolCanvases.push(...rightEffects.poolCanvases); + } } // Render transition with effect-applied canvases @@ -1610,9 +1621,9 @@ export async function renderTransitionToCanvas( renderTransition(ctx, activeTransition, leftFinalCanvas, rightFinalCanvas, transitionSettings, rctx.gpuTransitionPipeline); // Release all pool canvases (GPU output canvases are managed by the pipeline) - if (leftEffectPoolCanvas) canvasPool.release(leftEffectPoolCanvas); + for (const effectCanvas of leftEffectPoolCanvases) canvasPool.release(effectCanvas); canvasPool.release(leftCanvas); - if (rightEffectPoolCanvas) canvasPool.release(rightEffectPoolCanvas); + for (const effectCanvas of rightEffectPoolCanvases) canvasPool.release(effectCanvas); canvasPool.release(rightCanvas); } @@ -1660,6 +1671,7 @@ export function resolveTransitionParticipantRenderState doesMaskAffectTrack(mask.trackOrder, trackOrder)); // NOTE: The importExternalTexture zero-copy path is disabled because // textureSampleBaseClampToEdge produces subtly different edge pixel values @@ -1233,16 +1236,22 @@ export async function createCompositionRenderer( getLog().warn('GPU pipeline init failed — GPU effects will be skipped'); } } - const { canvas: effectCanvas, ctx: effectCtx } = canvasPool.acquire(); - const deferredGpuCanvas = await applyAllEffectsAsync(effectCtx, itemCanvas, combinedEffects, frame, canvasSettings, itemRenderContext.gpuPipeline); + const { source, poolCanvases } = await renderEffectsFromMaskedSource( + canvasPool, + itemCanvas, + combinedEffects, + applicableMasks, + 
frame, + maskSettings, + itemRenderContext.gpuPipeline, + ); canvasPool.release(itemCanvas); - const source = deferredGpuCanvas ?? effectCanvas; if (deferred) { - return { source, poolCanvases: [effectCanvas] }; + return { source, poolCanvases }; } targetCtx.drawImage(source, 0, 0); - canvasPool.release(effectCanvas); + for (const effectCanvas of poolCanvases) canvasPool.release(effectCanvas); return null; } @@ -1334,6 +1343,7 @@ export async function createCompositionRenderer( adjustmentLayers, frame, renderMode === 'preview' ? getPreviewEffectsOverride : undefined, + renderMode === 'preview' ? getLiveItemSnapshot : undefined, ); const allEffects = [...itemEffects, ...adjEffects]; @@ -1462,7 +1472,14 @@ export async function createCompositionRenderer( } // Transitions: render to a dedicated canvas const { canvas: trCanvas, ctx: trCtx } = canvasPool.acquire(); - await renderTransitionToCanvas(trCtx, task.transition, frame, itemRenderContext, task.trackOrder); + await renderTransitionToCanvas( + trCtx, + task.transition, + frame, + itemRenderContext, + task.trackOrder, + activeMasks.filter((mask) => doesMaskAffectTrack(mask.trackOrder, task.trackOrder)), + ); return { source: trCanvas, poolCanvases: [trCanvas] } as { source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }; }), ); diff --git a/src/features/media-library/components/background-task-progress.test.tsx b/src/features/media-library/components/background-task-progress.test.tsx new file mode 100644 index 000000000..6db6525a5 --- /dev/null +++ b/src/features/media-library/components/background-task-progress.test.tsx @@ -0,0 +1,49 @@ +import { fireEvent, render, screen } from '@testing-library/react'; +import { describe, expect, it, vi } from 'vitest'; +import { BackgroundTaskProgress } from './background-task-progress'; + +describe('BackgroundTaskProgress', () => { + it('renders determinate progress with custom meta actions', () => { + const onCancel = vi.fn(); + + render( + icon} + label="Generating transcripts" + progressAriaLabel="Transcript generation progress" + progressPercent={42.4} + meta={( + <> + 42% + + + )} + fillClassName="bg-blue-500" + /> + ); + + expect(screen.getByText('Generating transcripts')).toBeInTheDocument(); + expect(screen.getByRole('progressbar', { name: 'Transcript generation progress' })) + .toHaveAttribute('aria-valuenow', '42'); + + fireEvent.click(screen.getByText('Cancel all')); + expect(onCancel).toHaveBeenCalledTimes(1); + }); + + it('renders indeterminate progress without a numeric value', () => { + render( + icon} + label="Analyzing with AI" + progressAriaLabel="AI analysis progress" + indeterminate + meta={Working...} + fillClassName="bg-purple-500" + /> + ); + + expect(screen.getByRole('progressbar', { name: 'AI analysis progress' })) + .not.toHaveAttribute('aria-valuenow'); + expect(screen.getByText('Working...')).toBeInTheDocument(); + }); +}); diff --git a/src/features/media-library/components/background-task-progress.tsx b/src/features/media-library/components/background-task-progress.tsx new file mode 100644 index 000000000..b4730e453 --- /dev/null +++ b/src/features/media-library/components/background-task-progress.tsx @@ -0,0 +1,66 @@ +import type { ReactNode } from 'react'; +import { cn } from '@/shared/ui/cn'; + +interface BackgroundTaskProgressProps { + icon: ReactNode; + label: string; + progressAriaLabel: string; + progressPercent?: number | null; + indeterminate?: boolean; + meta?: ReactNode; + trailing?: ReactNode; + fillClassName: string; +} + +export function 
BackgroundTaskProgress({ + icon, + label, + progressAriaLabel, + progressPercent = null, + indeterminate = false, + meta, + trailing, + fillClassName, +}: BackgroundTaskProgressProps) { + const clampedPercent = progressPercent == null + ? null + : Math.max(0, Math.min(100, Math.round(progressPercent))); + + return ( +
+        {/* …component markup lost in extraction: renders {icon} and {label},
+            the optional {meta} actions, a role="progressbar" element named by
+            progressAriaLabel (aria-valuenow is clampedPercent unless
+            indeterminate) with a fill bar styled by fillClassName, and the
+            optional {trailing} slot; see the sketch after the media-card
+            hunk below… */}
+ ); +} diff --git a/src/features/media-library/components/compositions-section.tsx b/src/features/media-library/components/compositions-section.tsx index 00ed03720..6ede2d955 100644 --- a/src/features/media-library/components/compositions-section.tsx +++ b/src/features/media-library/components/compositions-section.tsx @@ -52,6 +52,7 @@ export function CompositionsSection() { const viewMode = useMediaLibraryStore((s) => s.viewMode); const mediaItemSize = useMediaLibraryStore((s) => s.mediaItemSize); const selectedCompositionIds = useMediaLibraryStore((s) => s.selectedCompositionIds); + const isTranscriptionDialogOpen = useEditorStore((s) => s.transcriptionDialogDepth > 0); const selectedCompositionIdSet = useMemo(() => new Set(selectedCompositionIds), [selectedCompositionIds]); const [open, setOpen] = useState(true); const [deleteTarget, setDeleteTarget] = useState(null); @@ -200,6 +201,7 @@ export function CompositionsSection() { composition={comp} viewMode={viewMode} selected={selectedCompositionIdSet.has(comp.id)} + isTranscriptionDialogOpen={isTranscriptionDialogOpen} dragDisabled={wouldCreateCompositionCycle({ parentCompositionId: activeCompositionId, insertedCompositionId: comp.id, @@ -263,6 +265,7 @@ interface CompositionCardProps { composition: SubComposition; viewMode: 'grid' | 'list'; selected: boolean; + isTranscriptionDialogOpen: boolean; dragDisabled: boolean; isEditing: boolean; editValue: string; @@ -279,6 +282,7 @@ const CompositionCard = memo(function CompositionCard({ composition, viewMode, selected, + isTranscriptionDialogOpen, dragDisabled, isEditing, editValue, @@ -375,7 +379,7 @@ const CompositionCard = memo(function CompositionCard({ [isEditing, onSelect] ); - const canHoverPreview = composition.durationInFrames > 0; + const canHoverPreview = composition.durationInFrames > 0 && !isTranscriptionDialogOpen; const updateSkimPreview = useCallback((clientX: number) => { const thumbnailContainer = thumbnailContainerRef.current; diff --git a/src/features/media-library/components/media-card.test.tsx b/src/features/media-library/components/media-card.test.tsx index 4c8a1eca7..01b4c648c 100644 --- a/src/features/media-library/components/media-card.test.tsx +++ b/src/features/media-library/components/media-card.test.tsx @@ -7,6 +7,7 @@ const mediaLibraryServiceMocks = vi.hoisted(() => ({ getThumbnailBlobUrl: vi.fn(), getMediaFile: vi.fn(), getMediaBlobUrl: vi.fn(), + updateMediaCaptions: vi.fn(), })); const proxyServiceMocks = vi.hoisted(() => ({ @@ -20,6 +21,8 @@ const proxyServiceMocks = vi.hoisted(() => ({ const mediaTranscriptionServiceMocks = vi.hoisted(() => ({ transcribeMedia: vi.fn(), + deleteTranscript: vi.fn(), + cancelTranscription: vi.fn(), })); const mediaStoreState = vi.hoisted(() => ({ @@ -28,7 +31,7 @@ const mediaStoreState = vi.hoisted(() => ({ importingIds: [] as string[], proxyStatus: new Map(), proxyProgress: new Map(), - transcriptStatus: new Map(), + transcriptStatus: new Map(), transcriptProgress: new Map(), taggingMediaIds: new Set(), setProxyStatus: vi.fn(), @@ -37,7 +40,18 @@ const mediaStoreState = vi.hoisted(() => ({ setTranscriptProgress: vi.fn(), clearTranscriptProgress: vi.fn(), setTaggingMedia: vi.fn(), + updateMediaCaptions: vi.fn(), showNotification: vi.fn(), + analysisProgress: null as null | { total: number; completed: number; cancelRequested: boolean }, + beginAnalysisRun: vi.fn(), + incrementAnalysisCompleted: vi.fn(), + requestAnalysisCancel: vi.fn(), + endAnalysisRun: vi.fn(), +})); + +const analysisMocks = vi.hoisted(() => ({ + 
captionVideo: vi.fn(), + captionImage: vi.fn(), })); const editorStoreState = vi.hoisted(() => ({ @@ -47,6 +61,10 @@ const editorStoreState = vi.hoisted(() => ({ mediaSkimPreviewMediaId: null as string | null, })); +const playbackStoreState = vi.hoisted(() => ({ + pause: vi.fn(), +})); + const sourcePlayerStoreState = vi.hoisted(() => ({ setCurrentMediaId: vi.fn(), clearInOutPoints: vi.fn(), @@ -82,6 +100,41 @@ vi.mock('@/components/ui/button', () => ({ }) => , })); +vi.mock('./transcribe-dialog', () => ({ + TranscribeDialog: ({ + open, + onStart, + onCancel, + }: { + open: boolean; + onStart: (values: { + model: string; + quantization: string; + language: string; + }) => void; + onCancel: () => void; + }) => + open ? ( +
+        {/* …mock markup lost in extraction: controls that forward fixed
+            model/quantization/language values to onStart and invoke
+            onCancel… */}
+ ) : null, +})); + vi.mock('./media-info-popover', () => ({ MediaInfoPopover: ({ onSeekToCaption }: { onSeekToCaption?: (timeSec: number) => void }) => (
+ ); } // Grid view return ( + <> + {transcribeDialog}
)} {!isBroken && proxyStatus === 'generating' && ( -
+            {/* …overlay markup lost in extraction: proxy-generation
+                indicator… */}
+          )}
+          {!isBroken && isTagging && (
+            {/* …overlay markup lost in extraction: AI-analysis (tagging)
+                indicator… */}
)} @@ -883,6 +1027,21 @@ export const MediaCard = memo(function MediaCard({ style={{ left: `${skimProgress * 100}%` }} /> )} + {!isBroken && !isImporting && isTranscribing && transcriptProgressPercent !== null && ( +
+            {/* …markup lost in extraction: thin transcription progress bar
+                whose fill width tracks transcriptProgressPercent… */}
+          )}
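
Since the JSX of `BackgroundTaskProgress` was lost to extraction above, here is a minimal sketch reconstructed from its props interface and the assertions in `background-task-progress.test.tsx`. Element structure and styling are assumptions; only the aria behavior is pinned down by the tests.

```tsx
import type { ReactNode } from 'react';

interface BackgroundTaskProgressProps {
  icon: ReactNode;
  label: string;
  progressAriaLabel: string;
  progressPercent?: number | null;
  indeterminate?: boolean;
  meta?: ReactNode;
  trailing?: ReactNode;
  fillClassName: string;
}

// Sketch only: layout and class names are guesses, not the PR's markup.
export function BackgroundTaskProgress({
  icon,
  label,
  progressAriaLabel,
  progressPercent = null,
  indeterminate = false,
  meta,
  trailing,
  fillClassName,
}: BackgroundTaskProgressProps) {
  // Percent is clamped to [0, 100] and rounded before reaching the DOM.
  const clampedPercent = progressPercent == null
    ? null
    : Math.max(0, Math.min(100, Math.round(progressPercent)));

  // Indeterminate bars omit aria-valuenow, per the second test case.
  const ariaValueNow =
    indeterminate || clampedPercent == null ? undefined : clampedPercent;

  return (
    <div>
      {icon}
      <span>{label}</span>
      {meta && <div>{meta}</div>}
      <div
        role="progressbar"
        aria-label={progressAriaLabel}
        aria-valuenow={ariaValueNow}
        aria-valuemin={0}
        aria-valuemax={100}
      >
        <div
          className={fillClassName}
          style={{ width: `${clampedPercent ?? 100}%` }}
        />
      </div>
      {trailing}
    </div>
  );
}
```
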
{/* Content footer - minimal */} @@ -918,5 +1077,6 @@ export const MediaCard = memo(function MediaCard({
+ ); }); diff --git a/src/features/media-library/components/media-info-popover.test.tsx b/src/features/media-library/components/media-info-popover.test.tsx new file mode 100644 index 000000000..94f73c1c9 --- /dev/null +++ b/src/features/media-library/components/media-info-popover.test.tsx @@ -0,0 +1,125 @@ +import type { ReactNode, MouseEvent } from 'react'; +import React, { createContext, cloneElement, isValidElement, useContext } from 'react'; +import { render, screen, fireEvent, waitFor } from '@testing-library/react'; +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import type { MediaMetadata, MediaTranscript } from '@/types/storage'; + +const mediaTranscriptionServiceMocks = vi.hoisted(() => ({ + getTranscript: vi.fn(), +})); + +vi.mock('@/components/ui/popover', () => { + const PopoverContext = createContext<{ + open: boolean; + onOpenChange: (open: boolean) => void; + }>({ + open: false, + onOpenChange: () => {}, + }); + + return { + Popover: ({ + children, + open = false, + onOpenChange = () => {}, + }: { + children: ReactNode; + open?: boolean; + onOpenChange?: (open: boolean) => void; + }) => ( + +
+      <PopoverContext.Provider value={{ open, onOpenChange }}>
+        {children}
+      </PopoverContext.Provider>
+    ),
+    PopoverTrigger: ({
+      children,
+      asChild,
+    }: {
+      children: ReactNode;
+      asChild?: boolean;
+    }) => {
+      const { open, onOpenChange } = useContext(PopoverContext);
+      if (asChild && isValidElement(children)) {
+        return cloneElement(children, {
+          onClick: (event: MouseEvent) => {
+            children.props.onClick?.(event);
+            onOpenChange(!open);
+          },
+        });
+      }
+      // Reconstructed: the original fallback trigger element was lost in
+      // extraction; a plain toggle button is a safe stand-in for a mock.
+      return <button onClick={() => onOpenChange(!open)}>{children}</button>;
+    },
+    PopoverContent: ({ children }: { children: ReactNode }) => {
+      const { open } = useContext(PopoverContext);
+      return open ?
+        <div>{children}</div>
: null; + }, + }; +}); + +vi.mock('../services/media-transcription-service', () => ({ + mediaTranscriptionService: mediaTranscriptionServiceMocks, +})); + +vi.mock('../transcription/registry', () => ({ + getMediaTranscriptionModelLabel: (model: string) => model === 'whisper-tiny' ? 'Tiny' : model, +})); + +import { MediaInfoPopover } from './media-info-popover'; + +function makeMedia(overrides: Partial = {}): MediaMetadata { + return { + id: 'media-1', + storageType: 'handle', + fileName: 'clip.mp4', + fileSize: 1024, + mimeType: 'video/mp4', + duration: 5, + width: 1920, + height: 1080, + fps: 30, + codec: 'h264', + bitrate: 5000, + tags: [], + createdAt: 1, + updatedAt: 1, + ...overrides, + }; +} + +describe('MediaInfoPopover', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('loads and displays transcript details when opened', async () => { + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'whisper-tiny', + quantization: 'q8', + text: 'Hello world from transcript', + segments: [ + { text: 'Hello world', start: 1.25, end: 2.5 }, + ], + createdAt: 1, + updatedAt: 1, + }; + mediaTranscriptionServiceMocks.getTranscript.mockResolvedValue(transcript); + const onSeekToCaption = vi.fn(); + + render(); + + fireEvent.click(screen.getByTitle('Media info')); + + await waitFor(() => { + expect(mediaTranscriptionServiceMocks.getTranscript).toHaveBeenCalledWith('media-1'); + }); + + expect(await screen.findByText('Transcript (1)')).toBeInTheDocument(); + expect(screen.getByText('Tiny')).toBeInTheDocument(); + expect(screen.getByText('Hello world from transcript')).toBeInTheDocument(); + + fireEvent.click(screen.getByRole('button', { name: '0:01' })); + expect(onSeekToCaption).toHaveBeenCalledWith(1.25); + }); +}); diff --git a/src/features/media-library/components/media-info-popover.tsx b/src/features/media-library/components/media-info-popover.tsx index 5645a090a..805a4bb25 100644 --- a/src/features/media-library/components/media-info-popover.tsx +++ b/src/features/media-library/components/media-info-popover.tsx @@ -1,8 +1,11 @@ -import { Info, Video, FileAudio, Image as ImageIcon, Film, Clock, Maximize2, HardDrive, FileType, Sparkles } from 'lucide-react'; +import { Info, Video, FileAudio, Image as ImageIcon, Film, Clock, Maximize2, HardDrive, FileType, Loader2, FileText } from 'lucide-react'; +import { useEffect, useState } from 'react'; import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; -import type { MediaMetadata } from '@/types/storage'; +import type { MediaMetadata, MediaTranscript } from '@/types/storage'; import { getMediaType, formatDuration } from '../utils/validation'; import { formatBytes } from '@/shared/utils/format-utils'; +import { mediaTranscriptionService } from '../services/media-transcription-service'; +import { getMediaTranscriptionModelLabel } from '../transcription/registry'; function formatTimestamp(sec: number): string { const m = Math.floor(sec / 60); @@ -19,8 +22,12 @@ interface MediaInfoPopoverProps { } export function MediaInfoPopover({ media, triggerClassName, onSeekToCaption }: MediaInfoPopoverProps) { + const [open, setOpen] = useState(false); + const [transcript, setTranscript] = useState(null); + const [transcriptLoading, setTranscriptLoading] = useState(false); const mediaType = getMediaType(media.mimeType); const typeLabel = mediaType === 'video' ? 'Video' : mediaType === 'audio' ? 
'Audio' : 'Image'; + const isTranscribable = mediaType === 'video' || mediaType === 'audio'; const rows: Array<{ icon: React.ReactNode; label: string; value: string }> = []; @@ -46,8 +53,33 @@ export function MediaInfoPopover({ media, triggerClassName, onSeekToCaption }: M rows.push({ icon: , label: 'Frame Rate', value: `${media.fps.toFixed(2)} fps` }); } + useEffect(() => { + if (!open || !isTranscribable) { + return; + } + + let cancelled = false; + setTranscriptLoading(true); + + void mediaTranscriptionService.getTranscript(media.id) + .then((loadedTranscript) => { + if (!cancelled) { + setTranscript(loadedTranscript ?? null); + } + }) + .finally(() => { + if (!cancelled) { + setTranscriptLoading(false); + } + }); + + return () => { + cancelled = true; + }; + }, [isTranscribable, media.id, open]); + return ( - + - {caption.text} + {transcriptLoading ? ( +
+              {/* …row markup lost in extraction: loading spinner… */}
+              Loading transcript...
+            ) : transcript ? (
+              {/* …container markup lost in extraction… */}
+              {transcript.text}
+              {transcript.segments.map((segment, i) => (
+                {/* …row markup lost in extraction: a timestamp button
+                    labeled formatTimestamp(segment.start) that calls
+                    onSeekToCaption(segment.start), followed by
+                    {segment.text}; see the sketch after this hunk… */}
+              ))}
-              ))}
-
+            ) : null}
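
For reference: the hunk above shows `formatTimestamp` only down to its first statement. A plausible completion, consistent with the `'0:01'` button name the test expects for `start: 1.25` (the remainder of the body is an inference, not the PR's code):

```ts
function formatTimestamp(sec: number): string {
  const m = Math.floor(sec / 60);
  // Inferred: whole seconds, zero-padded to two digits (1.25 -> "0:01").
  const s = Math.floor(sec % 60);
  return `${m}:${s.toString().padStart(2, '0')}`;
}
```
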
)} diff --git a/src/features/media-library/components/media-library.tsx b/src/features/media-library/components/media-library.tsx index 6119ba65f..40d8c21f9 100644 --- a/src/features/media-library/components/media-library.tsx +++ b/src/features/media-library/components/media-library.tsx @@ -1,5 +1,6 @@ import { useEffect, useRef, useState, useMemo, memo, useCallback } from 'react'; -import { Search, Filter, SortAsc, Video, FileAudio, Image as ImageIcon, Trash2, Grid3x3, List, AlertTriangle, Info, X, FolderOpen, Link2Off, ChevronRight, Film, ArrowLeft, Zap, Loader2, Copy, Check, Upload } from 'lucide-react'; +import { Search, Filter, SortAsc, Video, FileAudio, Image as ImageIcon, Trash2, Grid3x3, List, AlertTriangle, Info, X, FolderOpen, Link2Off, ChevronRight, Film, ArrowLeft, Zap, Loader2, Copy, Check, Upload, Sparkles, FileText, ScanSearch } from 'lucide-react'; +import { SceneBrowserPanel, useSceneBrowserStore } from '../deps/scene-browser'; import { createLogger } from '@/shared/logging/logger'; const logger = createLogger('MediaLibrary'); @@ -28,10 +29,17 @@ import { CollapsibleContent, CollapsibleTrigger, } from '@/components/ui/collapsible'; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from '@/components/ui/tooltip'; import { MarqueeOverlay } from '@/components/marquee-overlay'; import { cn } from '@/shared/ui/cn'; import { MediaGrid } from './media-grid'; import { CompositionsSection } from './compositions-section'; +import { BackgroundTaskProgress } from './background-task-progress'; import { MissingMediaDialog } from './missing-media-dialog'; import { OrphanedClipsDialog } from './orphaned-clips-dialog'; import { UnsupportedAudioCodecDialog } from './unsupported-audio-codec-dialog'; @@ -47,10 +55,16 @@ import { import { useProjectStore } from '@/features/media-library/deps/projects'; import { proxyService } from '../services/proxy-service'; import { mediaLibraryService } from '../services/media-library-service'; +import { mediaTranscriptionService } from '../services/media-transcription-service'; +import { mediaAnalysisService } from '../services/media-analysis-service'; import { extractValidMediaFileEntriesFromDataTransfer } from '../utils/file-drop'; import { getSharedProxyKey } from '../utils/proxy-key'; import { getMediaType } from '../utils/validation'; import { getProjectBrokenMediaIds } from '@/features/media-library/utils/broken-media'; +import { + getTranscriptionOverallProgress, + getTranscriptionStageLabel, +} from '@/shared/utils/transcription-progress'; import type { MediaMetadata } from '@/types/storage'; import { isMarqueeJustFinished, useMarqueeSelection, type MarqueeItem } from '@/hooks/use-marquee-selection'; @@ -72,6 +86,21 @@ function CopyButton({ text }: { text: string }) { ); } +function HeaderActionTooltip({ + label, + children, +}: { + label: string; + children: React.ReactNode; +}) { + return ( + + {children} + {label} + + ); +} + const GROUP_ICONS = { video: Video, audio: FileAudio, @@ -158,6 +187,8 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL const setSortBy = useMediaLibraryStore((s) => s.setSortBy); const viewMode = useMediaLibraryStore((s) => s.viewMode); const setViewMode = useMediaLibraryStore((s) => s.setViewMode); + const sceneBrowserOpen = useSceneBrowserStore((s) => s.open); + const toggleSceneBrowser = useSceneBrowserStore((s) => s.toggleBrowser); const mediaItemSize = useMediaLibraryStore((s) => s.mediaItemSize); const setMediaItemSize = useMediaLibraryStore((s) => 
s.setMediaItemSize); const selectedMediaIds = useMediaLibraryStore((s) => s.selectedMediaIds); @@ -175,6 +206,8 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL const projectStoreProjectId = useProjectStore((s) => s.currentProject?.id ?? null); const proxyStatus = useMediaLibraryStore((s) => s.proxyStatus); const proxyProgress = useMediaLibraryStore((s) => s.proxyProgress); + const transcriptStatus = useMediaLibraryStore((s) => s.transcriptStatus); + const transcriptProgress = useMediaLibraryStore((s) => s.transcriptProgress); const filteredMediaItems = useFilteredMediaItems(); const mediaGroups = useMemo(() => { const groups: { key: string; label: string; icon: 'video' | 'audio' | 'image' | 'gif'; items: MediaMetadata[] }[] = []; @@ -487,6 +520,19 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return count; }, [proxyStatus]); + const analysisProgress = useMediaLibraryStore((s) => s.analysisProgress); + const analysisPercent = analysisProgress && analysisProgress.total > 0 + ? (analysisProgress.completed / analysisProgress.total) * 100 + : 0; + + const transcribingCount = useMemo(() => { + let count = 0; + for (const status of transcriptStatus.values()) { + if (status === 'queued' || status === 'transcribing') count++; + } + return count; + }, [transcriptStatus]); + const currentProjectBrokenMediaIds = useMemo( () => getProjectBrokenMediaIds(brokenMediaIds, mediaById), [brokenMediaIds, mediaById] @@ -506,6 +552,31 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return count > 0 ? total / count : 0; }, [proxyStatus, proxyProgress, generatingCount]); + const transcribingAvgProgress = useMemo(() => { + if (transcribingCount === 0) return 0; + let total = 0; + let count = 0; + for (const [id, status] of transcriptStatus.entries()) { + if (status === 'queued' || status === 'transcribing') { + const progress = transcriptProgress.get(id); + total += progress ? getTranscriptionOverallProgress(progress) : 0; + count++; + } + } + return count > 0 ? total / count : 0; + }, [transcriptStatus, transcriptProgress, transcribingCount]); + + const singleTranscriptionStageLabel = useMemo(() => { + if (transcribingCount !== 1) return null; + for (const [id, status] of transcriptStatus.entries()) { + if (status === 'queued' || status === 'transcribing') { + const progress = transcriptProgress.get(id); + return progress ? getTranscriptionStageLabel(progress.stage) : null; + } + } + return null; + }, [transcriptStatus, transcriptProgress, transcribingCount]); + const handleGenerateSelectedProxies = async () => { const selectedItems = selectedMediaIds .map((id) => mediaById[id]) @@ -542,6 +613,16 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL } }; + const handleCancelAllTranscriptions = () => { + for (const [mediaId, status] of transcriptStatus.entries()) { + if (status !== 'queued' && status !== 'transcribing') { + continue; + } + + mediaTranscriptionService.cancelTranscription(mediaId); + } + }; + // Count selected items that are eligible for proxy generation const selectedProxyEligibleCount = useMemo(() => { return selectedMediaIds.filter((id) => { @@ -622,85 +703,112 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return (
{/* Header toolbar */} -
-
- {/* Import action */} - - - {/* Missing media indicator */} - {currentProjectBrokenMediaIds.length > 0 && ( - - )} - - - {/* Selection indicator & actions */} - {selectedAssetCount > 0 && ( - <> -
+
+ +
+ {/* Import action */} + + + - {/* Selection badge */} -
- {selectedAssetCount} - selected - -
+ {/* Scene browser view toggle — lives here with Import (not in + the filter row) because it switches the whole panel between + media-library and scene-captioner views; the search/filter + bar below only scopes whichever view is mounted. */} + + + - {/* Generate proxies for selection */} - {selectedProxyEligibleCount > 0 && ( + {/* Missing media indicator */} + {currentProjectBrokenMediaIds.length > 0 && ( + - )} + + )} - {/* Delete action */} - - - )} -
+ + {/* Selection indicator & actions */} + {selectedAssetCount > 0 && ( + <> +
+ + {/* Selection badge */} +
+ {selectedAssetCount} + selected + + + +
+ + {/* Generate proxies for selection */} + {selectedProxyEligibleCount > 0 && ( + + + + )} + + {/* Delete action */} + + + + + )} +
+
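
`HeaderActionTooltip`, defined earlier in this file's diff, lost its JSX to extraction. A sketch assuming the conventional composition of the shadcn/ui tooltip parts the file imports (the real markup may differ):

```tsx
import type { ReactNode } from 'react';
import {
  Tooltip,
  TooltipContent,
  TooltipProvider,
  TooltipTrigger,
} from '@/components/ui/tooltip';

// Sketch: wraps a toolbar icon button with a labeled tooltip.
function HeaderActionTooltip({
  label,
  children,
}: {
  label: string;
  children: ReactNode;
}) {
  return (
    <TooltipProvider>
      <Tooltip>
        <TooltipTrigger asChild>{children}</TooltipTrigger>
        <TooltipContent>{label}</TooltipContent>
      </Tooltip>
    </TooltipProvider>
  );
}
```
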
{/* Error message */} @@ -772,7 +880,9 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL
)} - {/* Search and filters */} + {/* Search and filters — hidden in Scene mode since they only scope + the media library grid; the scene browser has its own search. */} + {!sceneBrowserOpen && (
{/* Search */}
@@ -923,6 +1033,7 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL
+ )} {/* Composition navigation banner — shown when inside a sub-composition */} {activeCompositionId !== null && activeCompLabel && ( @@ -941,9 +1052,13 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL {/* Scrollable content: wrapper provides relative context for the drag overlay */}
+          {sceneBrowserOpen && <SceneBrowserPanel />}
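
For orientation, a minimal sketch of the store contract behind this toggle. Only `open` and `toggleBrowser` are confirmed by the usage in this hunk; zustand is assumed from the project's other stores, and the real implementation lives in `@/features/scene-browser`, re-exported through the contract adapter later in this diff.

```ts
import { create } from 'zustand';

// Hypothetical shape; the actual store is not part of this diff.
interface SceneBrowserState {
  open: boolean;
  toggleBrowser: () => void;
}

export const useSceneBrowserStore = create<SceneBrowserState>((set) => ({
  open: false,
  toggleBrowser: () => set((state) => ({ open: !state.open })),
}));
```
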
+ {/* Background AI analysis status */} + {analysisProgress && ( + } + label={ + analysisProgress.total > 1 + ? `Analyzing ${Math.min(analysisProgress.completed + 1, analysisProgress.total)} of ${analysisProgress.total} with AI` + : 'Analyzing 1 item with AI' + } + progressAriaLabel="AI analysis progress" + progressPercent={analysisPercent} + meta={( + <> + {Math.round(analysisPercent)}% + {!analysisProgress.cancelRequested ? ( + + ) : ( + Cancelling… + )} + + )} + trailing={} + fillClassName="bg-purple-500" + /> + )} + + {/* Transcript generation progress bar */} + {transcribingCount > 0 && ( + } + label={`Generating ${transcribingCount} ${transcribingCount === 1 ? 'transcript' : 'transcripts'} in background`} + progressAriaLabel="Transcript generation progress" + progressPercent={transcribingAvgProgress * 100} + meta={( + <> + {singleTranscriptionStageLabel && ( + + {singleTranscriptionStageLabel} + + )} + + {Math.round(transcribingAvgProgress * 100)}% + + + + )} + fillClassName="bg-blue-500" + /> + )} + {/* Proxy generation progress bar */} {generatingCount > 0 && ( -
-            {/* …removed markup (lost in extraction): the old inline proxy
-                progress card, i.e. spinner, "Generating {generatingCount}
-                {generatingCount === 1 ? 'proxy' : 'proxies'} in background",
-                "{Math.round(generatingAvgProgress * 100)}%", a cancel
-                control, and the width-driven fill; superseded by the
-                <BackgroundTaskProgress …/> usage continuing below… */}
+ } + label={`Generating ${generatingCount} ${generatingCount === 1 ? 'proxy' : 'proxies'} in background`} + progressAriaLabel="Proxy generation progress" + progressPercent={generatingAvgProgress * 100} + meta={( + <> + + {Math.round(generatingAvgProgress * 100)}% + + + + )} + fillClassName="bg-green-500" + /> )} {/* Delete confirmation dialog */} diff --git a/src/features/media-library/components/transcribe-dialog.test.tsx b/src/features/media-library/components/transcribe-dialog.test.tsx new file mode 100644 index 000000000..fb10e26fa --- /dev/null +++ b/src/features/media-library/components/transcribe-dialog.test.tsx @@ -0,0 +1,261 @@ +import type { ReactNode } from 'react'; +import React, { useContext } from 'react'; +import { fireEvent, render, screen } from '@testing-library/react'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const settingsStoreState = vi.hoisted(() => ({ + defaultWhisperModel: 'whisper-base', + defaultWhisperQuantization: 'hybrid', + defaultWhisperLanguage: '', +})); + +const editorStoreState = vi.hoisted(() => ({ + clearMediaSkimPreview: vi.fn(), + clearCompoundClipSkimPreview: vi.fn(), + beginTranscriptionDialog: vi.fn(), + endTranscriptionDialog: vi.fn(), +})); + +const playbackStoreState = vi.hoisted(() => ({ + setPreviewFrame: vi.fn(), + pause: vi.fn(), +})); + +vi.mock('@/features/media-library/deps/settings-contract', () => ({ + useSettingsStore: (selector: (state: typeof settingsStoreState) => unknown) => selector(settingsStoreState), +})); + +vi.mock('@/app/state/editor', () => ({ + useEditorStore: (selector: (state: typeof editorStoreState) => unknown) => selector(editorStoreState), +})); + +vi.mock('@/shared/state/playback', () => ({ + usePlaybackStore: { + getState: () => playbackStoreState, + }, +})); + +vi.mock('../transcription/registry', () => ({ + getMediaTranscriptionModelOptions: () => [ + { value: 'whisper-base', label: 'Whisper Base' }, + ], +})); + +vi.mock('@/shared/utils/whisper-settings', () => ({ + getWhisperLanguageSelectValue: (value: string) => value, + getWhisperLanguageSettingValue: (value: string) => value, + normalizeSelectableWhisperModel: (value: string) => value, + WHISPER_LANGUAGE_OPTIONS: [ + { value: '', label: 'Auto-detect' }, + ], + WHISPER_QUANTIZATION_OPTIONS: [ + { value: 'hybrid', label: 'Hybrid' }, + ], +})); + +vi.mock('@/components/ui/button', () => ({ + Button: ({ + children, + onClick, + disabled, + }: { + children: ReactNode; + onClick?: () => void; + disabled?: boolean; + }) => ( + + ), +})); + +vi.mock('@/components/ui/label', () => ({ + Label: ({ children }: { children: ReactNode }) => , +})); + +vi.mock('@/components/ui/combobox', () => ({ + Combobox: ({ value, onValueChange, disabled }: { + value: string; + onValueChange: (value: string) => void; + disabled?: boolean; + }) => ( + onValueChange(event.target.value)} + /> + ), +})); + +vi.mock('@/components/ui/select', () => ({ + Select: ({ + children, + value, + onValueChange, + disabled, + }: { + children: ReactNode; + value: string; + onValueChange: (value: string) => void; + disabled?: boolean; + }) => ( + + ), + SelectTrigger: ({ children }: { children: ReactNode }) => <>{children}, + SelectValue: () => null, + SelectContent: ({ children }: { children: ReactNode }) => <>{children}, + SelectItem: ({ children, value }: { children: ReactNode; value: string }) => ( + + ), +})); + +vi.mock('lucide-react', () => ({ + Loader2: () => , + Square: () => , +})); + +vi.mock('@/components/ui/dialog', async () => { + const ReactModule = await 
import('react'); + const DialogContext = ReactModule.createContext<{ + open: boolean; + onOpenChange: (open: boolean) => void; + }>({ + open: false, + onOpenChange: () => {}, + }); + + return { + Dialog: ({ + open, + onOpenChange, + children, + }: { + open: boolean; + onOpenChange: (open: boolean) => void; + children: ReactNode; + }) => ( + + {open ? ( +
+ + {children} +
+ ) : null} +
+ ), + DialogContent: ({ + children, + hideCloseButton, + }: { + children: ReactNode; + hideCloseButton?: boolean; + }) => { + const { open, onOpenChange } = useContext(DialogContext); + if (!open) return null; + return ( +
+ {!hideCloseButton && ( + + )} + {children} +
+ ); + }, + DialogHeader: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogTitle: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogDescription: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogFooter: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>
, + }; +}); + +import { TranscribeDialog } from './transcribe-dialog'; + +describe('TranscribeDialog', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('clears background skim and scrub previews when opened', () => { + render( + + ); + + expect(editorStoreState.clearMediaSkimPreview).toHaveBeenCalledTimes(1); + expect(editorStoreState.clearCompoundClipSkimPreview).toHaveBeenCalledTimes(1); + expect(editorStoreState.beginTranscriptionDialog).toHaveBeenCalledTimes(1); + expect(playbackStoreState.setPreviewFrame).toHaveBeenCalledWith(null); + expect(playbackStoreState.pause).toHaveBeenCalledTimes(1); + }); + + it('requires stopping before the dialog can close mid-transcription', () => { + const onOpenChange = vi.fn(); + const onCancel = vi.fn(); + + render( + + ); + + expect(screen.queryByRole('button', { name: 'Close' })).not.toBeInTheDocument(); + expect(screen.queryByText('Close')).not.toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Stop' })).toBeInTheDocument(); + + fireEvent.click(screen.getByTestId('dialog-dismiss')); + expect(onOpenChange).not.toHaveBeenCalled(); + + fireEvent.click(screen.getByRole('button', { name: 'Stop' })); + expect(onCancel).toHaveBeenCalledTimes(1); + }); + + it('allows closing again once transcription is idle', () => { + const onOpenChange = vi.fn(); + + render( + + ); + + fireEvent.click(screen.getByRole('button', { name: 'Close' })); + expect(onOpenChange).toHaveBeenCalledWith(false); + }); +}); diff --git a/src/features/media-library/components/transcribe-dialog.tsx b/src/features/media-library/components/transcribe-dialog.tsx new file mode 100644 index 000000000..2a306b89b --- /dev/null +++ b/src/features/media-library/components/transcribe-dialog.tsx @@ -0,0 +1,255 @@ +import { useCallback, useEffect, useMemo, useState } from 'react'; +import { Loader2, Square } from 'lucide-react'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Button } from '@/components/ui/button'; +import { Label } from '@/components/ui/label'; +import { Combobox } from '@/components/ui/combobox'; +import { useEditorStore } from '@/app/state/editor'; +import { usePlaybackStore } from '@/shared/state/playback'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { useSettingsStore } from '@/features/media-library/deps/settings-contract'; +import { + getMediaTranscriptionModelOptions, +} from '../transcription/registry'; +import { + getWhisperLanguageSelectValue, + getWhisperLanguageSettingValue, + normalizeSelectableWhisperModel, + WHISPER_LANGUAGE_OPTIONS, + WHISPER_QUANTIZATION_OPTIONS, +} from '@/shared/utils/whisper-settings'; +import type { + MediaTranscriptModel, + MediaTranscriptQuantization, +} from '@/types/storage'; + +export interface TranscribeDialogValues { + model: MediaTranscriptModel; + quantization: MediaTranscriptQuantization; + language: string; +} + +interface TranscribeDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + fileName: string; + hasTranscript: boolean; + isRunning: boolean; + progressPercent: number | null; + progressLabel: string; + errorMessage?: string | null; + onStart: (values: TranscribeDialogValues) => void; + onCancel: () => void; +} + +export function TranscribeDialog({ + open, + onOpenChange, + fileName, + hasTranscript, + isRunning, + progressPercent, + progressLabel, + errorMessage, + onStart, + onCancel, +}: 
TranscribeDialogProps) { + const defaultModel = useSettingsStore((s) => s.defaultWhisperModel); + const defaultQuantization = useSettingsStore((s) => s.defaultWhisperQuantization); + const defaultLanguage = useSettingsStore((s) => s.defaultWhisperLanguage); + const clearMediaSkimPreview = useEditorStore((s) => s.clearMediaSkimPreview); + const clearCompoundClipSkimPreview = useEditorStore((s) => s.clearCompoundClipSkimPreview); + const beginTranscriptionDialog = useEditorStore((s) => s.beginTranscriptionDialog); + const endTranscriptionDialog = useEditorStore((s) => s.endTranscriptionDialog); + + const modelOptions = useMemo(() => getMediaTranscriptionModelOptions(), []); + + const [model, setModel] = useState(() => + normalizeSelectableWhisperModel(defaultModel), + ); + const [quantization, setQuantization] = useState(defaultQuantization); + const [languageValue, setLanguageValue] = useState(() => + getWhisperLanguageSelectValue(defaultLanguage), + ); + + useEffect(() => { + if (!open) return; + setModel(normalizeSelectableWhisperModel(defaultModel)); + setQuantization(defaultQuantization); + setLanguageValue(getWhisperLanguageSelectValue(defaultLanguage)); + }, [open, defaultLanguage, defaultModel, defaultQuantization]); + + useEffect(() => { + if (!open) return; + beginTranscriptionDialog(); + clearMediaSkimPreview(); + clearCompoundClipSkimPreview(); + usePlaybackStore.getState().setPreviewFrame(null); + usePlaybackStore.getState().pause(); + + return () => { + endTranscriptionDialog(); + }; + }, [ + beginTranscriptionDialog, + clearCompoundClipSkimPreview, + clearMediaSkimPreview, + endTranscriptionDialog, + open, + ]); + + const handleStart = () => { + onStart({ + model, + quantization, + language: getWhisperLanguageSettingValue(languageValue), + }); + }; + + const handleOpenChange = useCallback((nextOpen: boolean) => { + if (isRunning && !nextOpen) { + return; + } + onOpenChange(nextOpen); + }, [isRunning, onOpenChange]); + + const title = hasTranscript ? 'Refresh Transcript' : 'Generate Transcript'; + + return ( + + event.preventDefault()} + onInteractOutside={(event) => event.preventDefault()} + onEscapeKeyDown={(event) => { + if (isRunning) event.preventDefault(); + }} + > + + {title} + {fileName} + + +
+          {/* …form markup lost in extraction: three labeled controls, for
+              model (Combobox over modelOptions), quantization (Select over
+              WHISPER_QUANTIZATION_OPTIONS), and language (Select over
+              WHISPER_LANGUAGE_OPTIONS), each disabled while isRunning… */}
+ + +
+ + {errorMessage && !isRunning && ( +
+ {errorMessage} +
+ )} + + {isRunning && ( +
+            {/* …row markup lost in extraction: progress spinner… */}
+            {progressLabel}
+            {progressPercent !== null && (
+              {/* …bar markup lost in extraction: fill width driven by
+                  progressPercent… */}
+            )}
+          )}
+        <DialogFooter>
+          {/* …footer controls lost in extraction: while isRunning, a single
+              "Stop" button invoking onCancel; otherwise a "Close" button
+              calling onOpenChange(false) and a start button invoking
+              handleStart… */}
+        </DialogFooter>
+ ); +} diff --git a/src/features/media-library/contracts/timeline.ts b/src/features/media-library/contracts/timeline.ts index b143f9fff..1c605e6dd 100644 --- a/src/features/media-library/contracts/timeline.ts +++ b/src/features/media-library/contracts/timeline.ts @@ -10,6 +10,10 @@ export { getMediaTranscriptionModelLabel, getMediaTranscriptionModelOptions, } from '../transcription/registry'; +export { + TranscribeDialog, + type TranscribeDialogValues, +} from '../components/transcribe-dialog'; export { opfsService } from '../services/opfs-service'; export { resolveMediaUrl, diff --git a/src/features/media-library/deps/analysis-contract.ts b/src/features/media-library/deps/analysis-contract.ts new file mode 100644 index 000000000..9291c6d1f --- /dev/null +++ b/src/features/media-library/deps/analysis-contract.ts @@ -0,0 +1,28 @@ +/** + * Cross-feature contract — analysis infrastructure used by media-library. + * + * Split out of `analysis.ts` so additional analysis imports (embeddings, + * future providers) stay in one auditable place for the boundary checker. + */ + +export { captionVideo, captionImage } from '@/infrastructure/analysis'; +export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/infrastructure/analysis'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, + extractDominantColorPhrase, +} from '@/infrastructure/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, +} from '@/infrastructure/analysis'; diff --git a/src/features/media-library/deps/analysis.ts b/src/features/media-library/deps/analysis.ts index c68c392c2..6195064ad 100644 --- a/src/features/media-library/deps/analysis.ts +++ b/src/features/media-library/deps/analysis.ts @@ -1,2 +1 @@ -export { captionVideo, captionImage } from '@/infrastructure/analysis'; -export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/infrastructure/analysis'; +export * from './analysis-contract'; diff --git a/src/features/media-library/deps/composition-runtime-contract.ts b/src/features/media-library/deps/composition-runtime-contract.ts index 574b11249..5a8c7362c 100644 --- a/src/features/media-library/deps/composition-runtime-contract.ts +++ b/src/features/media-library/deps/composition-runtime-contract.ts @@ -8,4 +8,7 @@ export { startPreviewAudioConform, startPreviewAudioStartupWarm, } from '@/features/composition-runtime/utils/audio-decode-cache'; -export { deletePreviewAudioConform } from '@/features/composition-runtime/utils/preview-audio-conform'; +export { + deletePreviewAudioConform, + resolvePreviewAudioConformUrl, +} from '@/features/composition-runtime/utils/preview-audio-conform'; diff --git a/src/features/media-library/deps/scene-browser-contract.ts b/src/features/media-library/deps/scene-browser-contract.ts new file mode 100644 index 000000000..54078cba9 --- /dev/null +++ b/src/features/media-library/deps/scene-browser-contract.ts @@ -0,0 +1,10 @@ +/** + * Adapter — media-library mounts the Scene Browser panel and opens it from + * the info popover through this contract. 
+ */ + +export { + SceneBrowserPanel, + useSceneBrowserStore, + invalidateMediaCaptionThumbnails, +} from '@/features/scene-browser'; diff --git a/src/features/media-library/deps/scene-browser.ts b/src/features/media-library/deps/scene-browser.ts new file mode 100644 index 000000000..bf30c3858 --- /dev/null +++ b/src/features/media-library/deps/scene-browser.ts @@ -0,0 +1 @@ +export * from './scene-browser-contract'; diff --git a/src/features/media-library/deps/settings-contract.ts b/src/features/media-library/deps/settings-contract.ts index c704a2be5..aa61d8958 100644 --- a/src/features/media-library/deps/settings-contract.ts +++ b/src/features/media-library/deps/settings-contract.ts @@ -2,4 +2,9 @@ * Adapter — re-exports settings store for media-library consumption. */ -export { useSettingsStore } from '@/features/settings/stores/settings-store'; +export { + useSettingsStore, + resolveCaptioningIntervalSec, + DEFAULT_CAPTIONING_INTERVAL_SECONDS, +} from '@/features/settings/stores/settings-store'; +export type { CaptioningIntervalUnit } from '@/features/settings/stores/settings-store'; diff --git a/src/features/media-library/services/media-analysis-service.test.ts b/src/features/media-library/services/media-analysis-service.test.ts new file mode 100644 index 000000000..eda66eb79 --- /dev/null +++ b/src/features/media-library/services/media-analysis-service.test.ts @@ -0,0 +1,167 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import type { MediaMetadata } from '@/types/storage'; + +const captionImageMock = vi.fn(); +const captionVideoMock = vi.fn(); +const resolveCaptioningIntervalSecMock = vi.fn(() => 3); +const saveCaptionThumbnailMock = vi.fn(); +const deleteCaptionThumbnailsMock = vi.fn(); +const deleteCaptionEmbeddingsMock = vi.fn(); +const updateMediaCaptionsMock = vi.fn(); +const getMediaBlobUrlMock = vi.fn(); +const invalidateMediaCaptionThumbnailsMock = vi.fn(); +const storeGetStateMock = vi.fn(); + +let storeState: ReturnType; + +vi.mock('../deps/analysis', () => ({ + captionImage: captionImageMock, + captionVideo: captionVideoMock, + EMBEDDING_MODEL_ID: 'embed-model', + EMBEDDING_MODEL_DIM: 384, + CLIP_MODEL_ID: 'clip-model', + CLIP_EMBEDDING_DIM: 512, + embeddingsProvider: { + ensureReady: vi.fn(), + embedBatch: vi.fn(), + }, + clipProvider: { + ensureReady: vi.fn(), + embedImages: vi.fn(), + }, + buildEmbeddingText: vi.fn(() => 'caption text'), + extractDominantColors: vi.fn(), +})); + +vi.mock('../deps/settings-contract', () => ({ + useSettingsStore: { + getState: () => ({ + captioningIntervalUnit: 'seconds', + captioningIntervalValue: 3, + }), + }, + resolveCaptioningIntervalSec: resolveCaptioningIntervalSecMock, +})); + +vi.mock('@/infrastructure/storage', () => ({ + saveCaptionThumbnail: saveCaptionThumbnailMock, + deleteCaptionThumbnails: deleteCaptionThumbnailsMock, + deleteCaptionEmbeddings: deleteCaptionEmbeddingsMock, + saveCaptionEmbeddings: vi.fn(), + saveCaptionImageEmbeddings: vi.fn(), + getTranscript: vi.fn(), +})); + +vi.mock('../deps/scene-browser', () => ({ + invalidateMediaCaptionThumbnails: invalidateMediaCaptionThumbnailsMock, +})); + +vi.mock('../stores/media-library-store', () => ({ + useMediaLibraryStore: { + getState: storeGetStateMock, + }, +})); + +vi.mock('./media-library-service', () => ({ + mediaLibraryService: { + getMediaBlobUrl: getMediaBlobUrlMock, + updateMediaCaptions: updateMediaCaptionsMock, + }, +})); + +vi.mock('../utils/validation', () => ({ + getMediaType: (mimeType: string) => { + if 
(mimeType.startsWith('image/')) return 'image'; + if (mimeType.startsWith('video/')) return 'video'; + return 'unknown'; + }, +})); + +const { mediaAnalysisService } = await import('./media-analysis-service'); + +function createStoreState() { + return { + analysisProgress: null as { total: number; completed: number; cancelRequested: boolean } | null, + beginAnalysisRun: vi.fn((count: number) => { + storeState.analysisProgress = { total: count, completed: 0, cancelRequested: false }; + }), + incrementAnalysisCompleted: vi.fn(), + endAnalysisRun: vi.fn(() => { + storeState.analysisProgress = null; + }), + setTaggingMedia: vi.fn(), + updateMediaCaptions: vi.fn(), + showNotification: vi.fn(), + }; +} + +function makeMedia(overrides: Partial = {}): MediaMetadata { + return { + id: 'media-1', + fileName: 'frame.png', + storageType: 'opfs', + fileSize: 1024, + mimeType: 'image/png', + duration: 0, + width: 1920, + height: 1080, + fps: 30, + bitrate: 0, + codec: 'png', + thumbnailId: 'thumb-1', + tags: [], + createdAt: 1, + updatedAt: 1, + ...overrides, + }; +} + +describe('mediaAnalysisService.analyzeMedia', () => { + beforeEach(() => { + vi.clearAllMocks(); + storeState = createStoreState(); + storeGetStateMock.mockImplementation(() => storeState); + getMediaBlobUrlMock.mockResolvedValue('blob:media-1'); + updateMediaCaptionsMock.mockImplementation(async (_mediaId: string, captions: unknown) => ({ + ...makeMedia(), + aiCaptions: captions as MediaMetadata['aiCaptions'], + })); + captionVideoMock.mockReset(); + captionImageMock.mockReset(); + resolveCaptioningIntervalSecMock.mockReturnValue(3); + vi.stubGlobal('fetch', vi.fn(async () => new Response(new Blob(['image-bytes'], { type: 'image/png' })))); + Object.defineProperty(URL, 'revokeObjectURL', { + configurable: true, + value: vi.fn(), + }); + }); + + it('keeps existing scene assets intact when re-analysis fails', async () => { + const media = makeMedia({ + aiCaptions: [{ timeSec: 0, text: 'Existing caption', thumbRelPath: 'media/media-1/cache/ai/captions-thumbs/0.jpg' }], + }); + captionImageMock.mockRejectedValue(new Error('caption failed')); + + await expect(mediaAnalysisService.analyzeMedia(media)).resolves.toBe(false); + + expect(deleteCaptionThumbnailsMock).not.toHaveBeenCalled(); + expect(deleteCaptionEmbeddingsMock).not.toHaveBeenCalled(); + expect(updateMediaCaptionsMock).not.toHaveBeenCalled(); + }); + + it('clears caption metadata and old assets when a rerun finds no scenes', async () => { + const media = makeMedia({ + aiCaptions: [{ timeSec: 0, text: 'Existing caption', thumbRelPath: 'media/media-1/cache/ai/captions-thumbs/0.jpg' }], + }); + captionImageMock.mockResolvedValue([]); + + await expect(mediaAnalysisService.analyzeMedia(media)).resolves.toBe(true); + + expect(updateMediaCaptionsMock).toHaveBeenCalledWith(media.id, [], { + sampleIntervalSec: 3, + }); + expect(storeState.updateMediaCaptions).toHaveBeenCalledWith(media.id, []); + expect(deleteCaptionThumbnailsMock).toHaveBeenCalledWith(media.id); + expect(deleteCaptionEmbeddingsMock).toHaveBeenCalledWith(media.id); + }); +}); diff --git a/src/features/media-library/services/media-analysis-service.ts b/src/features/media-library/services/media-analysis-service.ts new file mode 100644 index 000000000..1963c9f7a --- /dev/null +++ b/src/features/media-library/services/media-analysis-service.ts @@ -0,0 +1,385 @@ +/** + * Runs the "Analyze with AI" pipeline for a single media item — captions, + * dominant-color palette, text embeddings, and CLIP image embeddings — so + * both the 
media card's per-item menu and the scene browser's "analyze all" + * action hit the exact same path. + * + * Extracted from `media-card.tsx` so there's one authoritative flow for + * wiping stale thumbs/embeddings, running the captioner, indexing, and + * persisting to the workspace. The call site does nothing but drive UI. + */ + +import type { MediaMetadata } from '@/types/storage'; +import { + captionVideo, + captionImage, + type MediaCaption, + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, +} from '../deps/analysis'; +import { + useSettingsStore, + resolveCaptioningIntervalSec, +} from '../deps/settings-contract'; +import { + saveCaptionThumbnail, + deleteCaptionThumbnails, + deleteCaptionEmbeddings, + saveCaptionEmbeddings, + saveCaptionImageEmbeddings, + getTranscript, +} from '@/infrastructure/storage'; +import { invalidateMediaCaptionThumbnails } from '../deps/scene-browser'; +import { useMediaLibraryStore } from '../stores/media-library-store'; +import { mediaLibraryService } from './media-library-service'; +import { getMediaType } from '../utils/validation'; +import { createLogger } from '@/shared/logging/logger'; + +const logger = createLogger('MediaAnalysisService'); + +export interface AnalyzeBatchResult { + analyzed: number; + skipped: number; + failed: number; +} + +export interface AnalyzeBatchOptions { + /** When true, only analyze media that has no captions yet. Default: false (re-analyze everything). */ + onlyMissing?: boolean; + /** Optional filter for which media to consider (e.g. a single scope id). */ + mediaIds?: readonly string[]; +} + +class MediaAnalysisService { + private batchInFlight = false; + + /** + * Analyze a single media item end-to-end. Accepts either a mediaId (resolved + * from the library store) or the full `MediaMetadata` when the caller + * already has it. Returns true on success, false on failure — notifications + * are surfaced via the media-library store either way. + * + * When called standalone (not from `analyzeBatch`), wraps itself as a + * 1-item run so the background progress bar shows a concrete 0→100% + * instead of a pulsing indeterminate bar. + */ + async analyzeMedia(mediaOrId: string | MediaMetadata): Promise { + const store = useMediaLibraryStore.getState(); + const media = typeof mediaOrId === 'string' + ? store.mediaItems.find((m) => m.id === mediaOrId) + : mediaOrId; + if (!media) return false; + + const ownsRun = !this.batchInFlight && !store.analysisProgress; + if (ownsRun) { + store.beginAnalysisRun(1); + } + try { + const ok = await this.analyzeOne(media); + if (ownsRun) { + useMediaLibraryStore.getState().incrementAnalysisCompleted(1); + } + return ok; + } finally { + if (ownsRun) { + useMediaLibraryStore.getState().endAnalysisRun(); + } + } + } + + private async analyzeOne(media: MediaMetadata): Promise { + const store = useMediaLibraryStore.getState(); + const mediaType = getMediaType(media.mimeType); + if (mediaType !== 'video' && mediaType !== 'image') return false; + + const { captioningIntervalUnit, captioningIntervalValue } = useSettingsStore.getState(); + const sampleIntervalSec = resolveCaptioningIntervalSec( + captioningIntervalUnit, + captioningIntervalValue, + media.fps, + ); + + store.setTaggingMedia(media.id, true); + + try { + // Drop every in-memory thumbnail URL and semantic cache entry tied to + // this media before re-analysis starts. 
If the rerun fails, the old + // on-disk assets still exist and can be rehydrated on demand; if it + // succeeds, fresh thumbs/embeddings repopulate the caches below. + invalidateMediaCaptionThumbnails(media.id); + + let captions: MediaCaption[]; + const stagedThumbnailBlobs = new Map(); + + const stageThumbnail = async (index: number, blob: Blob): Promise => { + stagedThumbnailBlobs.set(index, blob); + return undefined; + }; + + if (mediaType === 'video') { + const blobUrl = await mediaLibraryService.getMediaBlobUrl(media.id); + if (!blobUrl) throw new Error('Could not load media file'); + + const video = document.createElement('video'); + video.muted = true; + video.preload = 'auto'; + video.src = blobUrl; + + await new Promise((resolve, reject) => { + video.onloadedmetadata = () => resolve(); + video.onerror = () => reject(new Error('Failed to load video')); + }); + + try { + captions = await captionVideo(video, { + sampleIntervalSec, + saveThumbnail: stageThumbnail, + }); + } finally { + video.src = ''; + URL.revokeObjectURL(blobUrl); + } + } else { + const blobUrl = await mediaLibraryService.getMediaBlobUrl(media.id); + if (!blobUrl) throw new Error('Could not load media file'); + + const response = await fetch(blobUrl); + const blob = await response.blob(); + URL.revokeObjectURL(blobUrl); + captions = await captionImage(blob, { saveThumbnail: stageThumbnail }); + } + + if (captions.length > 0) { + let embeddingModel: string | undefined; + let embeddingDim: number | undefined; + let imageEmbeddingModel: string | undefined; + let imageEmbeddingDim: number | undefined; + let captionsWithEmbeddings = captions; + + const thumbBlobs = captions.map((_, index) => + stagedThumbnailBlobs.get(index) ?? null, + ); + + const colorResults = await Promise.all( + thumbBlobs.map(async (blob) => { + if (!blob) return { phrase: '', palette: [] as const }; + try { return await extractDominantColors(blob); } + catch { return { phrase: '', palette: [] as const }; } + }), + ); + const palettesByIndex = colorResults.map((r) => r.palette); + + captionsWithEmbeddings = captions.map((caption, i) => { + const palette = palettesByIndex[i]; + const next = { ...caption } as typeof caption & { + palette?: typeof palette; + }; + if (palette && palette.length > 0) next.palette = [...palette]; + return next; + }); + + try { + await embeddingsProvider.ensureReady(); + + const transcript = await getTranscript(media.id).catch(() => null); + + const texts = captions.map((caption, i) => buildEmbeddingText({ + caption: { text: caption.text, timeSec: caption.timeSec }, + sceneData: caption.sceneData, + transcriptSegments: transcript?.segments, + colorPhrase: colorResults[i]?.phrase ?? 
'', + })); + + const vectors = await embeddingsProvider.embedBatch(texts); + if (vectors.length === captions.length) { + await saveCaptionEmbeddings(media.id, vectors, EMBEDDING_MODEL_DIM); + embeddingModel = EMBEDDING_MODEL_ID; + embeddingDim = EMBEDDING_MODEL_DIM; + captionsWithEmbeddings = captionsWithEmbeddings.map((caption, i) => ({ + ...caption, + embedding: Array.from(vectors[i]!), + })); + } + } catch (error) { + store.showNotification({ + type: 'warning', + message: `Semantic indexing skipped for "${media.fileName}" — keyword search still works.`, + }); + void error; + } + + try { + const validBlobs = thumbBlobs.filter((b): b is Blob => b !== null); + if (validBlobs.length > 0 && validBlobs.length === captions.length) { + await clipProvider.ensureReady(); + const imageVectors = await clipProvider.embedImages(validBlobs); + if (imageVectors.length === captions.length) { + await saveCaptionImageEmbeddings(media.id, imageVectors, CLIP_EMBEDDING_DIM); + imageEmbeddingModel = CLIP_MODEL_ID; + imageEmbeddingDim = CLIP_EMBEDDING_DIM; + } + } + } catch (error) { + void error; + } + + if (stagedThumbnailBlobs.size > 0) { + captionsWithEmbeddings = await Promise.all( + captionsWithEmbeddings.map(async (caption, index) => { + const blob = stagedThumbnailBlobs.get(index); + if (!blob) return caption; + try { + const thumbRelPath = await saveCaptionThumbnail(media.id, index, blob); + return { ...caption, thumbRelPath }; + } catch { + return caption; + } + }), + ); + } + + await mediaLibraryService.updateMediaCaptions(media.id, captionsWithEmbeddings, { + sampleIntervalSec, + embeddingModel, + embeddingDim, + imageEmbeddingModel, + imageEmbeddingDim, + }); + store.updateMediaCaptions(media.id, captionsWithEmbeddings); + + const sceneCaptionCountLabel = `${captions.length} scene caption${captions.length === 1 ? '' : 's'}`; + store.showNotification({ + type: 'success', + message: `Generated ${sceneCaptionCountLabel} for "${media.fileName}"`, + }); + } else { + await mediaLibraryService.updateMediaCaptions(media.id, [], { + sampleIntervalSec, + }); + store.updateMediaCaptions(media.id, []); + await deleteCaptionThumbnails(media.id); + await deleteCaptionEmbeddings(media.id); + store.showNotification({ + type: 'info', + message: `No scene captions generated for "${media.fileName}"`, + }); + } + return true; + } catch (error) { + store.showNotification({ + type: 'error', + message: error instanceof Error ? error.message : 'Failed to analyze media', + }); + return false; + } finally { + store.setTaggingMedia(media.id, false); + } + } + + /** + * Analyze a batch of media sequentially. Sequential avoids thrashing the + * shared WebGPU device and CLIP model — parallelism here would starve the + * preview canvas and risk OOM on longer videos. + * + * A single batch is the unit of concurrency — calling twice while one is + * running is a no-op (second call resolves immediately with zeros). The + * per-item tagging flag blocks any overlapping per-card "Analyze" clicks. + */ + async analyzeBatch(options: AnalyzeBatchOptions = {}): Promise { + if (this.batchInFlight) { + return { analyzed: 0, skipped: 0, failed: 0 }; + } + this.batchInFlight = true; + + const store = useMediaLibraryStore.getState(); + const all = store.mediaItems; + const pool = options.mediaIds + ? 
all.filter((m) => options.mediaIds!.includes(m.id)) + : all; + + const targets = pool.filter((m) => { + const type = getMediaType(m.mimeType); + if (type !== 'video' && type !== 'image') return false; + if (options.onlyMissing && (m.aiCaptions?.length ?? 0) > 0) return false; + return true; + }); + + let analyzed = 0; + let failed = 0; + let cancelled = 0; + const skipped = pool.length - targets.length; + + try { + if (targets.length === 0) { + store.showNotification({ + type: 'info', + message: options.onlyMissing + ? 'No unanalyzed media to process.' + : 'No media to analyze.', + }); + return { analyzed: 0, skipped, failed: 0 }; + } + + store.beginAnalysisRun(targets.length); + store.showNotification({ + type: 'info', + message: targets.length === 1 + ? `Analyzing "${firstName(targets)}"…` + : `Analyzing ${targets.length} media files…`, + }); + + for (const media of targets) { + // Cancel is cooperative — the in-flight item finishes first. Any + // remaining items are skipped but still counted toward `completed` + // so the progress bar reaches 100% and unmounts cleanly instead + // of stranding the user with a stuck bar. + const { analysisProgress } = useMediaLibraryStore.getState(); + if (analysisProgress?.cancelRequested) { + cancelled = targets.length - (analyzed + failed); + useMediaLibraryStore.getState().incrementAnalysisCompleted(cancelled); + break; + } + logger.info('batch analyzing media', { mediaId: media.id, fileName: media.fileName }); + const ok = await this.analyzeOne(media); + if (ok) analyzed += 1; + else failed += 1; + useMediaLibraryStore.getState().incrementAnalysisCompleted(1); + } + + if (targets.length > 1) { + const suffix = failed > 0 ? ` — ${failed} failed` : ''; + const cancelSuffix = cancelled > 0 ? ` (${cancelled} cancelled)` : ''; + store.showNotification({ + type: cancelled > 0 ? 'warning' : (failed === 0 ? 'success' : 'warning'), + message: `Analyzed ${analyzed}/${targets.length}${suffix}${cancelSuffix}`, + }); + } + } finally { + useMediaLibraryStore.getState().endAnalysisRun(); + this.batchInFlight = false; + } + + return { analyzed, skipped, failed }; + } + + /** Ask the currently running analysis to stop after the in-flight item. */ + requestCancel(): void { + useMediaLibraryStore.getState().requestAnalysisCancel(); + } + + isBatchInFlight(): boolean { + return this.batchInFlight; + } +} + +function firstName(items: readonly MediaMetadata[]): string { + return items[0]?.fileName ?? ''; +} + +export const mediaAnalysisService = new MediaAnalysisService(); diff --git a/src/features/media-library/services/media-captioning-service.ts b/src/features/media-library/services/media-captioning-service.ts new file mode 100644 index 000000000..addd0f487 --- /dev/null +++ b/src/features/media-library/services/media-captioning-service.ts @@ -0,0 +1,273 @@ +/** + * Bridges AI captions (vision-language-model frame descriptions) into timeline + * text items. Mirrors {@link MediaTranscriptionService.insertTranscriptAsCaptions} + * but sources from `MediaCaption[]` (point-in-time descriptions) rather than + * whisper speech-to-text segments. + * + * Keep both services aligned in behavior — if one gains new track-placement + * or replacement logic, the other usually needs the same treatment. 
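+ *
+ * The shape difference, sketched with hypothetical values:
+ *
+ *   // whisper segment: spans the speech it transcribes
+ *   { start: 1.2, end: 3.4, text: 'hello there' }
+ *
+ *   // AI caption: a single sampled instant; duration is derived from the
+ *   // captioning sample interval when it is converted to a segment
+ *   { timeSec: 30, text: 'a person walks through a park' }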
+ */ + +import { useSelectionStore } from '@/shared/state/selection'; +import { createLogger } from '@/shared/logging/logger'; +import type { MediaCaption } from '@/infrastructure/analysis'; +import type { + AudioItem, + TextItem, + TimelineItem, + TimelineTrack, + VideoItem, +} from '@/types/timeline'; +import { + aiCaptionsToSegments, + buildCaptionTextItems, + buildCaptionTrackAbove, + findReplaceableCaptionItemsForClip, + findCompatibleCaptionTrackForRanges, + isCaptionTrackCandidate, + getCaptionTextItemTemplate, + getCaptionRangeForClip, +} from '../utils/caption-items'; +import { useProjectStore } from '@/features/media-library/deps/projects'; +import { useTimelineStore } from '@/features/media-library/deps/timeline-stores'; + +const logger = createLogger('MediaCaptioningService'); + +type CaptionableClip = AudioItem | VideoItem; + +interface InsertAiCaptionsOptions { + /** Restrict insertion to these clip ids. Defaults to selection/playhead heuristics. */ + clipIds?: readonly string[]; + /** If true, pre-existing AI-caption items on matched clips are removed first. */ + replaceExisting?: boolean; + /** Sample interval reported by the captioning provider — used to size trailing caption duration. */ + sampleIntervalSec?: number; +} + +export interface InsertAiCaptionsResult { + insertedItemCount: number; + removedItemCount: number; + /** `true` when no compatible clip was found on the timeline. */ + noTargetClips: boolean; +} + +class MediaCaptioningService { + /** + * Insert AI captions as timeline text items anchored to the clips that use + * `mediaId`. Finds a compatible existing caption track per clip, or creates + * one. Returns `noTargetClips: true` when the media isn't on the timeline + * yet — callers should treat that as a soft outcome, not an error. + */ + async insertAiCaptionsOnTimeline( + mediaId: string, + captions: readonly MediaCaption[], + options: InsertAiCaptionsOptions = {}, + ): Promise { + logger.info('insertAiCaptionsOnTimeline invoked', { + mediaId, + captionCount: captions.length, + options, + }); + + if (captions.length === 0) { + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: false }; + } + + const segments = aiCaptionsToSegments(captions, options.sampleIntervalSec); + logger.info('aiCaptionsToSegments produced segments', { + mediaId, + segmentCount: segments.length, + firstSegment: segments[0], + lastSegment: segments.at(-1), + }); + if (segments.length === 0) { + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: false }; + } + + const timeline = useTimelineStore.getState(); + const project = useProjectStore.getState().currentProject; + const targetClips = this.resolveTargetClips(mediaId, options.clipIds); + logger.info('resolveTargetClips result', { + mediaId, + targetClipCount: targetClips.length, + targetClipIds: targetClips.map((c) => c.id), + allClipsWithMediaId: timeline.items.filter((i) => 'mediaId' in i && i.mediaId === mediaId).length, + }); + if (targetClips.length === 0) { + logger.info(`No timeline clips for media ${mediaId} — captions saved but not inserted`); + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: true }; + } + + const canvasWidth = project?.metadata.width ?? 1920; + const canvasHeight = project?.metadata.height ?? 1080; + const newTracks: TimelineTrack[] = [...timeline.tracks]; + const generatedCaptionIdsToRemove = options.replaceExisting + ? 
new Set( + targetClips.flatMap((clip) => + findReplaceableCaptionItemsForClip(timeline.items, clip, 'ai-captions').map((item) => item.id), + ), + ) + : new Set(); + const plannedItems = timeline.items.filter((item) => !generatedCaptionIdsToRemove.has(item.id)); + const insertedItems: TextItem[] = []; + + for (const clip of targetClips) { + const clipRange = getCaptionRangeForClip(clip, segments, timeline.fps); + logger.info('per-clip getCaptionRangeForClip result', { + clipId: clip.id, + clipFrom: clip.from, + clipDurationInFrames: clip.durationInFrames, + sourceStart: clip.sourceStart, + sourceEnd: clip.sourceEnd, + sourceFps: clip.sourceFps, + timelineFps: timeline.fps, + clipRange, + }); + if (!clipRange) { + continue; + } + + const existingGeneratedCaptions = options.replaceExisting + ? findReplaceableCaptionItemsForClip(timeline.items, clip, 'ai-captions') + : []; + const preferredTrackId = this.resolvePreferredTrackId( + newTracks, + plannedItems, + existingGeneratedCaptions, + clipRange, + ); + + let targetTrack = preferredTrackId + ? newTracks.find((track) => track.id === preferredTrackId) ?? null + : findCompatibleCaptionTrackForRanges( + newTracks, + plannedItems, + [{ startFrame: clipRange.startFrame, endFrame: clipRange.endFrame }], + ); + + if (!targetTrack) { + // Drop the caption track directly above the clip's own track — that's + // where users expect overlaid subtitles. `buildCaptionTrackAbove` + // picks a fractional order between the clip track and the next track + // up so no existing tracks need to shift. + const clipTrack = newTracks.find((track) => track.id === clip.trackId); + targetTrack = clipTrack + ? buildCaptionTrackAbove(newTracks, clipTrack.order) + : buildCaptionTrackAbove(newTracks, 0); + newTracks.push(targetTrack); + newTracks.sort((a, b) => a.order - b.order); + } + + const clipCaptionItems = buildCaptionTextItems({ + mediaId, + trackId: targetTrack.id, + segments, + clip, + timelineFps: timeline.fps, + canvasWidth, + canvasHeight, + sourceType: 'ai-captions', + styleTemplate: existingGeneratedCaptions[0] + ? 
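+          // reuse the style of the captions being replaced so regenerating
+          // can keep any prior styling tweaks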
getCaptionTextItemTemplate(existingGeneratedCaptions[0]) + : undefined, + }); + logger.info('buildCaptionTextItems produced items', { + clipId: clip.id, + trackId: targetTrack.id, + itemCount: clipCaptionItems.length, + }); + + if (clipCaptionItems.length === 0) { + continue; + } + + insertedItems.push(...clipCaptionItems); + plannedItems.push(...clipCaptionItems); + } + + logger.info('insertAiCaptionsOnTimeline finishing', { + mediaId, + insertedItemCount: insertedItems.length, + removedItemCount: generatedCaptionIdsToRemove.size, + trackChangeCount: newTracks.length - timeline.tracks.length, + }); + + const tracksChanged = newTracks.length !== timeline.tracks.length + || newTracks.some((track, index) => track.id !== timeline.tracks[index]?.id); + if (tracksChanged) { + timeline.setTracks(newTracks); + } + + if (generatedCaptionIdsToRemove.size > 0) { + timeline.removeItems([...generatedCaptionIdsToRemove]); + } + + if (insertedItems.length > 0) { + timeline.addItems(insertedItems); + useSelectionStore.getState().selectItems(insertedItems.map((item) => item.id)); + } + + return { + insertedItemCount: insertedItems.length, + removedItemCount: generatedCaptionIdsToRemove.size, + noTargetClips: false, + }; + } + + private resolveTargetClips( + mediaId: string, + clipIds?: readonly string[], + ): CaptionableClip[] { + const timeline = useTimelineStore.getState(); + const selection = useSelectionStore.getState(); + + const matchingClips = timeline.items + .filter((item): item is CaptionableClip => + (item.type === 'video' || item.type === 'audio') && item.mediaId === mediaId, + ) + .sort((a, b) => a.from - b.from); + + if (matchingClips.length === 0) return []; + + if (clipIds && clipIds.length > 0) { + const requested = new Set(clipIds); + return matchingClips.filter((clip) => requested.has(clip.id)); + } + + const selectedClips = selection.selectedItemIds + .map((id) => matchingClips.find((clip) => clip.id === id)) + .filter((clip): clip is CaptionableClip => clip !== undefined); + if (selectedClips.length > 0) return selectedClips; + + // Default: caption every clip that uses this media. The whisper flow + // picks a single clip when many exist (it's long-form speech), but AI + // frame captions are inherently per-frame-range — applying to all clips + // is the less surprising default here. + return matchingClips; + } + + private resolvePreferredTrackId( + tracks: readonly TimelineTrack[], + items: readonly TimelineItem[], + existingCaptions: ReadonlyArray<{ trackId: string }>, + range: { startFrame: number; endFrame: number }, + ): string | null { + const trackIds = [...new Set(existingCaptions.map((item) => item.trackId))]; + if (trackIds.length !== 1) return null; + + const preferredTrack = tracks.find((track) => track.id === trackIds[0]); + if (!preferredTrack || !isCaptionTrackCandidate(preferredTrack, items)) { + return null; + } + + const hasOverlap = items.some((item) => { + if (item.trackId !== preferredTrack.id) return false; + const itemEnd = item.from + item.durationInFrames; + return item.from < range.endFrame && itemEnd > range.startFrame; + }); + + return hasOverlap ? 
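+      // an overlap means the preferred track is already occupied in this
+      // range, so return null and let the caller search for or create one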
+      null : preferredTrack.id;
+  }
+}
+
+export const mediaCaptioningService = new MediaCaptioningService();
diff --git a/src/features/media-library/services/media-library-service.ts b/src/features/media-library/services/media-library-service.ts
index 390e48e64..34d18e91a 100644
--- a/src/features/media-library/services/media-library-service.ts
+++ b/src/features/media-library/services/media-library-service.ts
@@ -56,6 +56,8 @@ import {
   getMediaForProject as getMediaForProjectDB,
   deleteTranscript,
 } from '@/infrastructure/storage';
+import { saveCaptions, deleteCaptions } from '@/infrastructure/storage/workspace-fs/captions';
+import { deleteScenes } from '@/infrastructure/storage/workspace-fs/scenes';
 import { filmstripCache, gifFrameCache, waveformCache } from '@/features/media-library/deps/timeline-services';
 import { opfsService } from './opfs-service';
 import { proxyService } from './proxy-service';
@@ -91,12 +93,13 @@ const IMPORT_BACKGROUND_WARM_DELAY_MS = 600;
 const IMPORT_BACKGROUND_HEAVY_DELAY_MS = 2200;
 
 /**
- * Media Library Service - Coordinates OPFS + IndexedDB + metadata extraction
+ * Media Library Service - Coordinates handle/OPFS media access with
+ * workspace-backed metadata, thumbnails, and derived caches.
  *
  * Includes in-memory thumbnail URL cache to prevent flicker on re-renders.
  *
- * Provides atomic operations for media management, ensuring OPFS and IndexedDB
- * stay in sync.
+ * Provides atomic operations for media management while keeping origin-scoped
+ * sources and the workspace folder in sync.
  */
 class MediaLibraryService {
   /** In-memory cache for thumbnail blob URLs to prevent flicker on re-renders */
@@ -110,6 +113,22 @@ class MediaLibraryService {
     }
   }
 
+  private async deleteCaptionsSafely(mediaId: string): Promise<void> {
+    try {
+      await deleteCaptions(mediaId);
+    } catch (error) {
+      logger.warn('Failed to delete captions:', error);
+    }
+  }
+
+  private async deleteScenesSafely(mediaId: string): Promise<void> {
+    try {
+      await deleteScenes(mediaId);
+    } catch (error) {
+      logger.warn('Failed to delete scenes:', error);
+    }
+  }
+
   private async deleteThumbnailsSafely(mediaId: string): Promise<void> {
     this.clearThumbnailCache(mediaId);
     try {
@@ -142,7 +161,7 @@ class MediaLibraryService {
 
   /**
    * Clear waveform caches for a fully-dereferenced media item. Removes
-   * the in-memory LRU entry, the IndexedDB binned persistence, and the
+   * the in-memory LRU entry, the persisted binned waveform cache, and the
    * OPFS + workspace-folder multi-resolution mirrors.
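+   * Best-effort, like the other `*Safely` helpers here: failures should be
+   * logged and swallowed so one stale cache never blocks media deletion.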
   */
  private async clearWaveformCacheSafely(mediaId: string): Promise<void> {
@@ -217,7 +236,7 @@ class MediaLibraryService {
   }
 
   /**
-   * Get all media items from IndexedDB
+   * Get all media items from workspace storage
    */
   async getAllMedia(): Promise<MediaMetadata[]> {
     return getAllMediaDB();
   }
@@ -320,7 +339,7 @@ class MediaLibraryService {
     // Check for unsupported audio codec (included in metadata from worker)
     const codecCheck = mediaProcessorService.hasUnsupportedAudioCodec(metadata);
 
-    // Stage 6: Save metadata to IndexedDB with file handle
+    // Stage 6: Save metadata with the file handle-backed source reference
     const mediaMetadata: MediaMetadata = {
       id,
       storageType: 'handle',
@@ -656,6 +675,8 @@ class MediaLibraryService {
 
     await deleteMediaDB(mediaId);
     await this.deleteTranscriptSafely(mediaId);
+    await this.deleteCaptionsSafely(mediaId);
+    await this.deleteScenesSafely(mediaId);
     await this.deleteThumbnailsSafely(mediaId);
     await this.clearGifFrameCacheSafely(mediaId);
     await this.clearFilmstripCacheSafely(mediaId);
@@ -758,6 +779,8 @@ class MediaLibraryService {
 
     await deleteMediaDB(id);
     await this.deleteTranscriptSafely(id);
+    await this.deleteCaptionsSafely(id);
+    await this.deleteScenesSafely(id);
   }
 
   /**
@@ -1009,11 +1032,41 @@ class MediaLibraryService {
   /**
    * Update AI-generated captions for a media item.
+   *
+   * Captions live in `cache/ai/captions.json` as the authoritative source.
+   * We also mirror them onto `MediaMetadata.aiCaptions` so in-memory zustand
+   * consumers and search (`media-library-store.ts`) don't need a separate
+   * hydration pass — the mirror stays consistent because this is the only
+   * writer.
    */
   async updateMediaCaptions(
     mediaId: string,
-    captions: Array<{ timeSec: number; text: string }>,
+    captions: NonNullable<MediaMetadata['aiCaptions']>,
+    options?: {
+      service?: string;
+      model?: string;
+      sampleIntervalSec?: number;
+      embeddingModel?: string;
+      embeddingDim?: number;
+      imageEmbeddingModel?: string;
+      imageEmbeddingDim?: number;
+    },
   ): Promise<void> {
+    try {
+      await saveCaptions({
+        mediaId,
+        captions,
+        service: options?.service ?? 'lfm-captioning',
+        model: options?.model ?? 'lfm-2.5-vl',
+        sampleIntervalSec: options?.sampleIntervalSec,
+        embeddingModel: options?.embeddingModel,
+        embeddingDim: options?.embeddingDim,
+        imageEmbeddingModel: options?.imageEmbeddingModel,
+        imageEmbeddingDim: options?.imageEmbeddingDim,
+      });
+    } catch (error) {
+      logger.warn(`Failed to persist captions for ${mediaId}; metadata mirror will still update`, error);
+    }
     return updateMediaDB(mediaId, { aiCaptions: captions });
   }
 
@@ -1071,7 +1124,7 @@ class MediaLibraryService {
   }
 
   /**
-   * Validate sync between OPFS and IndexedDB
+   * Validate sync between OPFS and workspace-backed metadata
    * Returns list of issues found
    *
    * Note: Only validates OPFS-based media.
Handle-based media is validated diff --git a/src/features/media-library/services/media-transcription-service.test.ts b/src/features/media-library/services/media-transcription-service.test.ts new file mode 100644 index 000000000..79e3f674d --- /dev/null +++ b/src/features/media-library/services/media-transcription-service.test.ts @@ -0,0 +1,532 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { waitFor } from '@testing-library/react'; +import type { MediaTranscript } from '@/types/storage'; +import type { TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline'; + +const saveTranscriptMock = vi.fn(); +const getTranscriptMock = vi.fn(); +const useTimelineStoreGetStateMock = vi.fn(); +const useProjectStoreGetStateMock = vi.fn(); +const useSelectionStoreGetStateMock = vi.fn(); +const usePlaybackStoreGetStateMock = vi.fn(); +const transcribeCollectMock = vi.fn(); +const transcribeMock = vi.fn(); +const getMediaMock = vi.fn(); +const getMediaFileMock = vi.fn(); +const startPreviewAudioConformMock = vi.fn(); +const resolvePreviewAudioConformUrlMock = vi.fn(); + +vi.mock('@/infrastructure/storage', () => ({ + deleteTranscript: vi.fn(), + getTranscript: getTranscriptMock, + getTranscriptMediaIds: vi.fn(), + saveTranscript: saveTranscriptMock, +})); + +vi.mock('@/shared/state/selection', () => ({ + useSelectionStore: { + getState: useSelectionStoreGetStateMock, + }, +})); + +vi.mock('@/shared/state/playback', () => ({ + usePlaybackStore: { + getState: usePlaybackStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/projects', () => ({ + useProjectStore: { + getState: useProjectStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/timeline-stores', () => ({ + useTimelineStore: { + getState: useTimelineStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/settings-contract', () => ({ + useSettingsStore: { + getState: () => ({ + defaultWhisperModel: 'tiny', + defaultWhisperQuantization: 'q8', + defaultWhisperLanguage: 'auto', + }), + }, +})); + +vi.mock('../transcription/registry', () => ({ + getDefaultMediaTranscriptionAdapter: () => ({ + createTranscriber: () => ({ + transcribe: transcribeMock, + }), + }), + getMediaTranscriptionModelLabel: () => 'Tiny', +})); + +vi.mock('./media-library-service', () => ({ + mediaLibraryService: { + getMedia: getMediaMock, + getMediaFile: getMediaFileMock, + }, +})); + +vi.mock('@/features/media-library/deps/composition-runtime-contract', () => ({ + needsCustomAudioDecoder: vi.fn((codec?: string) => codec === 'pcm-s16be'), + startPreviewAudioConform: startPreviewAudioConformMock, + resolvePreviewAudioConformUrl: resolvePreviewAudioConformUrlMock, +})); + +const { mediaTranscriptionService } = await import('./media-transcription-service'); + +function makeTrack(id: string, order: number): TimelineTrack { + return { + id, + name: id, + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order, + items: [], + }; +} + +function makeTextItem( + id: string, + trackId: string, + from: number, + durationInFrames: number, +): TimelineItem { + return { + id, + type: 'text', + trackId, + from, + durationInFrames, + label: id, + text: id, + color: '#fff', + }; +} + +describe('mediaTranscriptionService.insertTranscriptAsCaptions', () => { + beforeEach(() => { + vi.clearAllMocks(); + useSelectionStoreGetStateMock.mockReturnValue({ + selectedItemIds: [], + selectItems: vi.fn(), + }); + usePlaybackStoreGetStateMock.mockReturnValue({ currentFrame: 0 }); + 
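+    // caption items are positioned against the project canvas, so the
+    // project-store mock has to supply metadata dimensions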
useProjectStoreGetStateMock.mockReturnValue({ + currentProject: { + metadata: { + width: 1920, + height: 1080, + }, + }, + }); + transcribeMock.mockReturnValue({ + collect: transcribeCollectMock, + }); + transcribeCollectMock.mockResolvedValue([]); + getMediaMock.mockResolvedValue(null); + getMediaFileMock.mockResolvedValue(null); + startPreviewAudioConformMock.mockResolvedValue(undefined); + resolvePreviewAudioConformUrlMock.mockResolvedValue(null); + }); + + it('creates a new captions track above the clip track when no compatible track exists', async () => { + const clip: VideoItem = { + id: 'clip-1', + type: 'video', + trackId: 'track-video', + from: 0, + durationInFrames: 90, + label: 'Clip', + mediaId: 'media-1', + src: 'blob:test', + sourceStart: 0, + sourceEnd: 90, + sourceDuration: 90, + sourceFps: 30, + speed: 1, + }; + const initialTracks = [ + makeTrack('track-top', 0), + makeTrack('track-video', 1), + makeTrack('track-bottom', 2), + ]; + const setTracks = vi.fn(); + const removeItems = vi.fn(); + const addItems = vi.fn(); + + useTimelineStoreGetStateMock.mockReturnValue({ + fps: 30, + tracks: initialTracks, + items: [ + clip, + makeTextItem('top-blocker', 'track-top', 0, 90), + makeTextItem('bottom-blocker', 'track-bottom', 0, 90), + ], + setTracks, + removeItems, + addItems, + }); + + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'tiny', + language: 'auto', + quantization: 'q8', + text: 'Hello there', + segments: [ + { text: 'Hello there', start: 0, end: 2 }, + ], + createdAt: Date.now(), + updatedAt: Date.now(), + }; + getTranscriptMock.mockResolvedValue(transcript); + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions('media-1', { + clipIds: ['clip-1'], + }); + + expect(result).toEqual({ + insertedItemCount: 1, + removedItemCount: 0, + }); + expect(setTracks).toHaveBeenCalledTimes(1); + + const updatedTracks = setTracks.mock.calls[0][0] as TimelineTrack[]; + const captionTrack = updatedTracks.find((track) => !initialTracks.some((existing) => existing.id === track.id)); + expect(captionTrack).toBeDefined(); + expect(captionTrack?.order).toBe(0.5); + + expect(addItems).toHaveBeenCalledTimes(1); + const insertedItems = addItems.mock.calls[0][0] as TimelineItem[]; + expect(insertedItems).toHaveLength(1); + expect(insertedItems[0]?.trackId).toBe(captionTrack?.id); + expect(removeItems).not.toHaveBeenCalled(); + }); + + it('does not reuse an audio track when regenerating transcript captions', async () => { + const clip: VideoItem = { + id: 'clip-1', + type: 'video', + trackId: 'track-video', + from: 0, + durationInFrames: 90, + label: 'Clip', + mediaId: 'media-1', + src: 'blob:test', + sourceStart: 0, + sourceEnd: 90, + sourceDuration: 90, + sourceFps: 30, + speed: 1, + }; + const initialTracks = [ + { ...makeTrack('track-audio', 0), name: 'A1', kind: 'audio' as const }, + { ...makeTrack('track-video', 1), name: 'V1', kind: 'video' as const }, + ]; + const legacyCaptionOnAudioTrack: TimelineItem = { + id: 'caption-old', + type: 'text', + trackId: 'track-audio', + from: 0, + durationInFrames: 30, + label: 'caption-old', + text: 'caption-old', + mediaId: 'media-1', + color: '#fff', + captionSource: { + type: 'transcript', + clipId: 'clip-1', + mediaId: 'media-1', + }, + }; + const setTracks = vi.fn(); + const removeItems = vi.fn(); + const addItems = vi.fn(); + + useTimelineStoreGetStateMock.mockReturnValue({ + fps: 30, + tracks: initialTracks, + items: [clip, legacyCaptionOnAudioTrack], + setTracks, + removeItems, + 
addItems, + }); + + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'tiny', + language: 'auto', + quantization: 'q8', + text: 'Hello there', + segments: [{ text: 'Hello there', start: 0, end: 2 }], + createdAt: Date.now(), + updatedAt: Date.now(), + }; + getTranscriptMock.mockResolvedValue(transcript); + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions('media-1', { + clipIds: ['clip-1'], + replaceExisting: true, + }); + + expect(result).toEqual({ + insertedItemCount: 1, + removedItemCount: 1, + }); + expect(setTracks).toHaveBeenCalledTimes(1); + + const updatedTracks = setTracks.mock.calls[0][0] as TimelineTrack[]; + const captionTrack = updatedTracks.find((track) => !initialTracks.some((existing) => existing.id === track.id)); + expect(captionTrack).toBeDefined(); + expect(captionTrack?.kind).toBe('video'); + + expect(addItems).toHaveBeenCalledTimes(1); + const insertedItems = addItems.mock.calls[0][0] as TimelineItem[]; + expect(insertedItems[0]?.trackId).toBe(captionTrack?.id); + expect(insertedItems[0]?.trackId).not.toBe('track-audio'); + expect(removeItems).toHaveBeenCalledWith(['caption-old']); + }); +}); + +describe('mediaTranscriptionService.transcribeMedia', () => { + beforeEach(() => { + vi.clearAllMocks(); + transcribeMock.mockReturnValue({ + collect: transcribeCollectMock, + }); + transcribeCollectMock.mockResolvedValue([ + { text: ' hello ', start: 0, end: 1.2 }, + ]); + startPreviewAudioConformMock.mockResolvedValue(undefined); + resolvePreviewAudioConformUrlMock.mockResolvedValue(null); + }); + + it('transcribes the original file for browser-decodable codecs', async () => { + const sourceFile = new File(['audio'], 'clip.mp3', { type: 'audio/mpeg' }); + getMediaMock.mockResolvedValue({ + id: 'media-1', + fileName: 'clip.mp3', + mimeType: 'audio/mpeg', + codec: 'mp3', + fileLastModified: 123, + }); + getMediaFileMock.mockResolvedValue(sourceFile); + + await mediaTranscriptionService.transcribeMedia('media-1'); + + expect(startPreviewAudioConformMock).not.toHaveBeenCalled(); + expect(transcribeMock).toHaveBeenCalledTimes(1); + expect(transcribeMock.mock.calls[0]?.[0]).toBe(sourceFile); + expect(saveTranscriptMock).toHaveBeenCalledTimes(1); + }); + + it('transcribes a conformed wav for custom-decoded codecs like pcm-s16be', async () => { + const sourceFile = new File(['pcm'], 'clip.aif', { type: 'audio/aiff' }); + const conformedBlob = new Blob(['wav'], { type: 'audio/wav' }); + const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + blob: async () => conformedBlob, + } as Response); + + getMediaMock.mockResolvedValue({ + id: 'media-1', + fileName: 'clip.aif', + mimeType: 'audio/aiff', + codec: 'pcm-s16be', + fileLastModified: 123, + }); + getMediaFileMock.mockResolvedValue(sourceFile); + resolvePreviewAudioConformUrlMock + .mockResolvedValueOnce(null) + .mockResolvedValueOnce('blob:conformed-audio'); + + await mediaTranscriptionService.transcribeMedia('media-1'); + + expect(startPreviewAudioConformMock).toHaveBeenCalledWith('media-1', sourceFile); + expect(resolvePreviewAudioConformUrlMock).toHaveBeenCalledWith('media-1'); + expect(transcribeMock).toHaveBeenCalledTimes(1); + + const transcribeFile = transcribeMock.mock.calls[0]?.[0] as File; + expect(transcribeFile).toBeInstanceOf(File); + expect(transcribeFile.type).toBe('audio/wav'); + + fetchMock.mockRestore(); + }); + + it('reuses a cached conformed wav without starting a new conform job', async () => { + const sourceFile = new 
File(['pcm'], 'clip.aif', { type: 'audio/aiff' });
+    const conformedBlob = new Blob(['wav'], { type: 'audio/wav' });
+    const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      blob: async () => conformedBlob,
+    } as Response);
+
+    getMediaMock.mockResolvedValue({
+      id: 'media-1',
+      fileName: 'clip.aif',
+      mimeType: 'audio/aiff',
+      codec: 'pcm-s16be',
+      fileLastModified: 123,
+    });
+    getMediaFileMock.mockResolvedValue(sourceFile);
+    resolvePreviewAudioConformUrlMock.mockResolvedValue('blob:cached-conformed-audio');
+
+    await mediaTranscriptionService.transcribeMedia('media-1');
+
+    expect(startPreviewAudioConformMock).not.toHaveBeenCalled();
+    expect(resolvePreviewAudioConformUrlMock).toHaveBeenCalledWith('media-1');
+    expect(transcribeMock).toHaveBeenCalledTimes(1);
+
+    const transcribeFile = transcribeMock.mock.calls[0]?.[0] as File;
+    expect(transcribeFile).toBeInstanceOf(File);
+    expect(transcribeFile.type).toBe('audio/wav');
+
+    fetchMock.mockRestore();
+  });
+
+  it('runs only one transcription job at a time and queues later requests', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let resolveFirstCollect!: (segments: Array<{ text: string; start: number; end: number }>) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((resolve) => {
+      resolveFirstCollect = resolve;
+    }));
+    const secondCollect = vi.fn().mockResolvedValue([
+      { text: ' second ', start: 0, end: 1 },
+    ]);
+
+    transcribeMock
+      .mockReturnValueOnce({ collect: firstCollect, cancel: vi.fn() })
+      .mockReturnValueOnce({ collect: secondCollect, cancel: vi.fn() });
+
+    const firstQueueState = vi.fn();
+    const secondQueueState = vi.fn();
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1', {
+      onQueueStatusChange: firstQueueState,
+    });
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2', {
+      onQueueStatusChange: secondQueueState,
+    });
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+    expect(firstQueueState).toHaveBeenCalledWith('running');
+    expect(secondQueueState).toHaveBeenCalledWith('queued');
+
+    resolveFirstCollect([{ text: ' first ', start: 0, end: 1 }]);
+
+    await firstPromise;
+    await secondPromise;
+
+    expect(transcribeMock).toHaveBeenCalledTimes(2);
+    expect(secondQueueState).toHaveBeenCalledWith('running');
+  });
+
+  it('cancels queued transcription jobs before they start', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let resolveFirstCollect!: (segments: Array<{ text: string; start: number; end: number }>) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((resolve) => {
+      resolveFirstCollect = resolve;
+    }));
+
+    transcribeMock
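+      // only the first stream is stubbed on purpose: media-2 must be
+      // cancelled while still queued, before a second transcribe call
+      // could ever happen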
.mockReturnValueOnce({ collect: firstCollect, cancel: vi.fn() });
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1');
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2');
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+
+    const secondRejection = expect(secondPromise).rejects.toThrow('Transcription cancelled');
+    expect(mediaTranscriptionService.cancelTranscription('media-2')).toBe(true);
+    await secondRejection;
+    expect(transcribeMock).toHaveBeenCalledTimes(1);
+
+    resolveFirstCollect([{ text: ' first ', start: 0, end: 1 }]);
+    await firstPromise;
+  });
+
+  it('cancels the active transcription job and advances the queue', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let rejectFirstCollect!: (error: Error) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((_, reject) => {
+      rejectFirstCollect = reject;
+    }));
+    const firstCancel = vi.fn((message?: string) => {
+      rejectFirstCollect(new Error(message ?? 'Transcription cancelled'));
+    });
+    const secondCollect = vi.fn().mockResolvedValue([
+      { text: ' second ', start: 0, end: 1 },
+    ]);
+
+    transcribeMock
+      .mockReturnValueOnce({ collect: firstCollect, cancel: firstCancel })
+      .mockReturnValueOnce({ collect: secondCollect, cancel: vi.fn() });
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1');
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2');
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+
+    expect(mediaTranscriptionService.cancelTranscription('media-1')).toBe(true);
+    await expect(firstPromise).rejects.toThrow('Transcription cancelled');
+
+    const secondTranscript = await secondPromise;
+    expect(firstCancel).toHaveBeenCalledWith('Transcription cancelled');
+    expect(secondTranscript.mediaId).toBe('media-2');
+    expect(transcribeMock).toHaveBeenCalledTimes(2);
+  });
+});
diff --git a/src/features/media-library/services/media-transcription-service.ts b/src/features/media-library/services/media-transcription-service.ts
index a4268fa45..93800f41d 100644
--- a/src/features/media-library/services/media-transcription-service.ts
+++ b/src/features/media-library/services/media-transcription-service.ts
@@ -9,7 +9,7 @@ import { useSelectionStore } from '@/shared/state/selection';
 import { createLogger } from '@/shared/logging/logger';
 import type { MediaTranscript, MediaTranscriptModel } from '@/types/storage';
 import type { AudioItem, TextItem, TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline';
-import type { TranscribeOptions } from '../transcription/types';
+import type { TranscriptSegment, TranscribeOptions } from '../transcription/types';
 import {
   getDefaultMediaTranscriptionAdapter,
   getMediaTranscriptionModelLabel,
@@ -17,20 +17,27 @@ import {
 import { mediaLibraryService } from './media-library-service';
 import {
   buildCaptionTextItems,
-  buildCaptionTrack,
+  buildCaptionTrackAbove,
   findReplaceableCaptionItemsForClip,
   findCompatibleCaptionTrackForRanges,
+  isCaptionTrackCandidate,
   getCaptionTextItemTemplate,
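+  // caption-track helpers shared with media-captioning-service.ts; per the
+  // note there, the two services must stay behaviorally aligned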
getCaptionRangeForClip,
 } from '../utils/caption-items';
 import { useProjectStore } from '@/features/media-library/deps/projects';
 import { useTimelineStore } from '@/features/media-library/deps/timeline-stores';
 import { useSettingsStore } from '@/features/media-library/deps/settings-contract';
+import {
+  needsCustomAudioDecoder,
+  resolvePreviewAudioConformUrl,
+  startPreviewAudioConform,
+} from '@/features/media-library/deps/composition-runtime-contract';
 import {
   DEFAULT_WHISPER_MODEL,
   DEFAULT_WHISPER_QUANTIZATION,
   normalizeWhisperLanguage,
 } from '@/shared/utils/whisper-settings';
+import { TRANSCRIPTION_CANCELLED_MESSAGE } from '@/shared/utils/transcription-cancellation';
 
 const logger = createLogger('MediaTranscriptionService');
 const DEFAULT_MODEL: MediaTranscriptModel = DEFAULT_WHISPER_MODEL;
@@ -47,61 +54,269 @@ interface InsertTranscriptAsCaptionsResult {
   removedItemCount: number;
 }
 
+type QueueState = 'queued' | 'running';
+
+interface TranscriptionRequestOptions {
+  language?: string;
+  model?: MediaTranscriptModel;
+  quantization?: TranscribeOptions['quantization'];
+  onProgress?: TranscribeOptions['onProgress'];
+  onQueueStatusChange?: (state: QueueState) => void;
+}
+
+interface QueuedTranscriptionListener {
+  onProgress?: TranscribeOptions['onProgress'];
+  onQueueStatusChange?: (state: QueueState) => void;
+}
+
+interface QueuedTranscriptionJob {
+  mediaId: string;
+  requestKey: string;
+  model: MediaTranscriptModel;
+  quantization: NonNullable<TranscribeOptions['quantization']>;
+  language?: string;
+  listeners: QueuedTranscriptionListener[];
+  promise: Promise<MediaTranscript>;
+  resolve: (value: MediaTranscript) => void;
+  reject: (reason?: unknown) => void;
+  state: QueueState;
+  stream: { collect(): Promise<TranscriptSegment[]>; cancel(message?: string): void } | null;
+  cancelled: boolean;
+  cancelMessage: string;
+}
+
 class MediaTranscriptionService {
   private readonly adapter = getDefaultMediaTranscriptionAdapter();
   private readonly transcriber = this.adapter.createTranscriber({
     model: DEFAULT_MODEL,
     quantization: DEFAULT_QUANTIZATION,
   });
+  private activeJob: QueuedTranscriptionJob | null = null;
+  private queue: QueuedTranscriptionJob[] = [];
 
   getTranscript = getTranscript;
   getTranscriptMediaIds = getTranscriptMediaIds;
-  deleteTranscript = deleteTranscript;
+
+  async deleteTranscript(mediaId: string): Promise<void> {
+    await deleteTranscript(mediaId);
+  }
 
   async transcribeMedia(
     mediaId: string,
-    options: Pick<TranscribeOptions, 'language' | 'model' | 'quantization' | 'onProgress'> = {},
+    options: TranscriptionRequestOptions = {},
   ): Promise<MediaTranscript> {
+    const settings = useSettingsStore.getState();
+    const model = options.model ?? settings.defaultWhisperModel ?? DEFAULT_MODEL;
+    const quantization =
+      options.quantization ?? settings.defaultWhisperQuantization ?? DEFAULT_QUANTIZATION;
+    const language = normalizeWhisperLanguage(options.language ?? settings.defaultWhisperLanguage);
+    const requestKey = `${mediaId}:${model}:${quantization}:${language ??
'auto'}`;
+    const listener: QueuedTranscriptionListener = {
+      onProgress: options.onProgress,
+      onQueueStatusChange: options.onQueueStatusChange,
+    };
+    const existingJob = this.findJobByKey(requestKey);
+
+    if (existingJob) {
+      this.attachListener(existingJob, listener);
+      return existingJob.promise;
+    }
+
+    const job = this.createJob({
+      mediaId,
+      requestKey,
+      model,
+      quantization,
+      language,
+      listener,
+    });
+
+    if (this.activeJob) {
+      this.queue.push(job);
+      this.setJobState(job, 'queued');
+    } else {
+      this.startJob(job);
+    }
+
+    return job.promise;
+  }
+
+  cancelTranscription(mediaId: string, message = TRANSCRIPTION_CANCELLED_MESSAGE): boolean {
+    let cancelled = false;
+
+    this.queue = this.queue.filter((job) => {
+      if (job.mediaId !== mediaId) {
+        return true;
+      }
+
+      cancelled = true;
+      this.cancelJob(job, message);
+      return false;
+    });
+
+    if (this.activeJob?.mediaId === mediaId) {
+      cancelled = true;
+      this.cancelJob(this.activeJob, message);
+    }
+
+    return cancelled;
+  }
+
+  private findJobByKey(requestKey: string): QueuedTranscriptionJob | null {
+    if (this.activeJob?.requestKey === requestKey) {
+      return this.activeJob;
+    }
+
+    return this.queue.find((job) => job.requestKey === requestKey) ?? null;
+  }
+
+  private createJob({
+    mediaId,
+    requestKey,
+    model,
+    quantization,
+    language,
+    listener,
+  }: {
+    mediaId: string;
+    requestKey: string;
+    model: MediaTranscriptModel;
+    quantization: NonNullable<TranscribeOptions['quantization']>;
+    language?: string;
+    listener: QueuedTranscriptionListener;
+  }): QueuedTranscriptionJob {
+    let resolve!: (value: MediaTranscript) => void;
+    let reject!: (reason?: unknown) => void;
+    const promise = new Promise<MediaTranscript>((resolvePromise, rejectPromise) => {
+      resolve = resolvePromise;
+      reject = rejectPromise;
+    });
+
+    return {
+      mediaId,
+      requestKey,
+      model,
+      quantization,
+      language,
+      listeners: [listener],
+      promise,
+      resolve,
+      reject,
+      state: 'queued',
+      stream: null,
+      cancelled: false,
+      cancelMessage: TRANSCRIPTION_CANCELLED_MESSAGE,
+    };
+  }
+
+  private attachListener(job: QueuedTranscriptionJob, listener: QueuedTranscriptionListener): void {
+    job.listeners.push(listener);
+    listener.onQueueStatusChange?.(job.state);
+  }
+
+  private setJobState(job: QueuedTranscriptionJob, state: QueueState): void {
+    job.state = state;
+    for (const listener of job.listeners) {
+      listener.onQueueStatusChange?.(state);
+    }
+  }
+
+  private cancelJob(job: QueuedTranscriptionJob, message: string): void {
+    job.cancelled = true;
+    job.cancelMessage = message;
+
+    if (job.state === 'queued') {
+      job.reject(new Error(message));
+      return;
+    }
+
+    job.stream?.cancel(message);
+  }
+
+  private startJob(job: QueuedTranscriptionJob): void {
+    this.activeJob = job;
+    this.setJobState(job, 'running');
+
+    void (async () => {
+      try {
+        const transcript = await this.executeTranscriptionJob(job);
+        job.resolve(transcript);
+      } catch (error) {
+        job.reject(error);
+      } finally {
+        if (this.activeJob === job) {
+          this.activeJob = null;
+        }
+        this.processNextJob();
+      }
+    })();
+  }
+
+  private processNextJob(): void {
+    if (this.activeJob) {
+      return;
+    }
+
+    const nextJob = this.queue.shift();
+    if (nextJob) {
+      this.startJob(nextJob);
+    }
+  }
+
+  private throwIfCancelled(job: QueuedTranscriptionJob): void {
+    if (job.cancelled) {
+      throw new Error(job.cancelMessage);
+    }
+  }
+
+  private async executeTranscriptionJob(job: QueuedTranscriptionJob): Promise<MediaTranscript> {
+    const mediaId = job.mediaId;
     const media = await mediaLibraryService.getMedia(mediaId);
     if (!media) {
       throw new Error(`Media not found:
${mediaId}`);
     }
+    this.throwIfCancelled(job);
 
     if (!media.mimeType.startsWith('audio/') && !media.mimeType.startsWith('video/')) {
       throw new Error('Only audio and video files can be transcribed');
     }
 
-    const blob = await mediaLibraryService.getMediaFile(mediaId);
-    if (!blob) {
+    const sourceBlob = await mediaLibraryService.getMediaFile(mediaId);
+    if (!sourceBlob) {
       throw new Error(`Could not load media file: ${media.fileName}`);
     }
+    this.throwIfCancelled(job);
 
-    const file = blob instanceof File
-      ? blob
-      : new File([blob], media.fileName, {
-          type: media.mimeType,
+    const transcriptionBlob = await this.resolveTranscriptionBlob(media, sourceBlob);
+    this.throwIfCancelled(job);
+
+    const file = transcriptionBlob instanceof File
+      ? transcriptionBlob
+      : new File([transcriptionBlob], media.fileName, {
+          type: transcriptionBlob.type || media.mimeType,
           lastModified: media.fileLastModified ?? Date.now(),
         });
 
-    const settings = useSettingsStore.getState();
-    const model = options.model ?? settings.defaultWhisperModel ?? DEFAULT_MODEL;
-    const quantization =
-      options.quantization ?? settings.defaultWhisperQuantization ?? DEFAULT_QUANTIZATION;
-    const language = normalizeWhisperLanguage(options.language ?? settings.defaultWhisperLanguage);
     const stream = this.transcriber.transcribe(file, {
-      model,
-      language,
-      quantization,
-      onProgress: options.onProgress,
+      model: job.model,
+      language: job.language,
+      quantization: job.quantization,
+      onProgress: (progress) => {
+        for (const listener of job.listeners) {
+          listener.onProgress?.(progress);
+        }
+      },
     });
+    job.stream = stream;
 
     const segments = await stream.collect();
+    this.throwIfCancelled(job);
 
     const transcript: MediaTranscript = {
       id: mediaId,
       mediaId,
-      model,
-      language,
-      quantization,
+      model: job.model,
+      language: job.language,
+      quantization: job.quantization,
       text: segments.map((segment) => segment.text.trim()).filter(Boolean).join(' ').trim(),
       segments: segments.map((segment) => ({
         text: segment.text.trim(),
@@ -121,6 +336,33 @@ class MediaTranscriptionService {
     return transcript;
   }
 
+  private async resolveTranscriptionBlob(media: { id: string; fileName: string; mimeType: string; codec: string; audioCodec?: string }, sourceBlob: Blob): Promise<Blob> {
+    const transcriptionCodec = media.mimeType.startsWith('audio/')
+      ? media.codec
+      : (media.audioCodec ?? media.codec);
+
+    if (!needsCustomAudioDecoder(transcriptionCodec)) {
+      return sourceBlob;
+    }
+
+    let conformedUrl = await resolvePreviewAudioConformUrl(media.id);
+    if (!conformedUrl) {
+      await startPreviewAudioConform(media.id, sourceBlob);
+      conformedUrl = await resolvePreviewAudioConformUrl(media.id);
+    }
+
+    if (!conformedUrl) {
+      throw new Error(`Failed to prepare ${transcriptionCodec || 'custom'} audio for transcription`);
+    }
+
+    const response = await fetch(conformedUrl);
+    if (!response.ok) {
+      throw new Error(`Failed to load conformed audio for transcription (${response.status})`);
+    }
+
+    return await response.blob();
+  }
+
   async insertTranscriptAsCaptions(
     mediaId: string,
     options: InsertTranscriptAsCaptionsOptions = {},
   ): Promise<InsertTranscriptAsCaptionsResult> {
@@ -143,7 +385,7 @@ class MediaTranscriptionService {
     const generatedCaptionIdsToRemove = options.replaceExisting
       ?
new Set( targetClips.flatMap((clip) => - findReplaceableCaptionItemsForClip(timeline.items, clip).map((item) => item.id) + findReplaceableCaptionItemsForClip(timeline.items, clip, 'transcript').map((item) => item.id) ) ) : new Set(); @@ -157,7 +399,7 @@ class MediaTranscriptionService { } const existingGeneratedCaptions = options.replaceExisting - ? findReplaceableCaptionItemsForClip(timeline.items, clip) + ? findReplaceableCaptionItemsForClip(timeline.items, clip, 'transcript') : []; const preferredTrackId = this.resolvePreferredCaptionTrackId( newTracks, @@ -175,7 +417,10 @@ class MediaTranscriptionService { ); if (!targetTrack) { - targetTrack = buildCaptionTrack(newTracks); + const clipTrack = newTracks.find((track) => track.id === clip.trackId); + targetTrack = clipTrack + ? buildCaptionTrackAbove(newTracks, clipTrack.order) + : buildCaptionTrackAbove(newTracks, 0); newTracks.push(targetTrack); newTracks.sort((a, b) => a.order - b.order); } @@ -283,7 +528,7 @@ class MediaTranscriptionService { } const preferredTrack = tracks.find((track) => track.id === trackIds[0]); - if (!preferredTrack || preferredTrack.visible === false || preferredTrack.locked || preferredTrack.isGroup) { + if (!preferredTrack || !isCaptionTrackCandidate(preferredTrack, items)) { return null; } diff --git a/src/features/media-library/services/proxy-service.ts b/src/features/media-library/services/proxy-service.ts index e85c15784..7734ecadb 100644 --- a/src/features/media-library/services/proxy-service.ts +++ b/src/features/media-library/services/proxy-service.ts @@ -346,8 +346,8 @@ class ProxyService { // Directory may not exist } - // Mirror deletion to workspace cache (best-effort, no-op when absent). - void removeWorkspaceCacheEntry([WORKSPACE_PROXIES_DIR, resolvedProxyKey], { + // Mirror deletion to workspace cache before reporting completion. + await removeWorkspaceCacheEntry([WORKSPACE_PROXIES_DIR, resolvedProxyKey], { recursive: true, }); } @@ -360,12 +360,11 @@ class ProxyService { const staleProxyIds: string[] = []; try { const root = await navigator.storage.getDirectory(); - let proxyRoot: FileSystemDirectoryHandle; - try { - proxyRoot = await root.getDirectoryHandle(PROXY_DIR); - } catch { - return staleProxyIds; // No proxies directory yet - } + // Create the dir if missing — on a fresh origin OPFS has no `proxies/` + // yet, but the workspace fallback below still needs a handle to back-fill + // into. Without `create: true` we'd bail before hydrating from the + // workspace folder and never show cross-origin-reused proxies. 
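+    // (`create: true` returns the existing directory when present, so this
+    // stays a cheap no-op on the common path.)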
+      const proxyRoot = await root.getDirectoryHandle(PROXY_DIR, { create: true });
 
       const requestedProxyKeys = new Set();
       for (const mediaId of mediaIds) {
diff --git a/src/features/media-library/stores/media-delete-actions.test.ts b/src/features/media-library/stores/media-delete-actions.test.ts
index d2726b4f3..18194e309 100644
--- a/src/features/media-library/stores/media-delete-actions.test.ts
+++ b/src/features/media-library/stores/media-delete-actions.test.ts
@@ -30,6 +30,12 @@ vi.mock('@/infrastructure/browser/blob-url-manager', () => ({
   blobUrlManager: blobUrlManagerMocks,
 }));
 
+const sceneBrowserMocks = vi.hoisted(() => ({
+  invalidateMediaCaptionThumbnails: vi.fn(),
+}));
+
+vi.mock('../deps/scene-browser', () => sceneBrowserMocks);
+
 type DeleteState = Partial & Partial;
 type DeleteUpdater =
   | Partial
diff --git a/src/features/media-library/stores/media-delete-actions.ts b/src/features/media-library/stores/media-delete-actions.ts
index 3fdffb666..bcd68e6fb 100644
--- a/src/features/media-library/stores/media-delete-actions.ts
+++ b/src/features/media-library/stores/media-delete-actions.ts
@@ -2,6 +2,7 @@
 import { mediaLibraryService } from '../services/media-library-service';
 import { proxyService } from '../services/proxy-service';
 import { blobUrlManager } from '@/infrastructure/browser/blob-url-manager';
+import { invalidateMediaCaptionThumbnails } from '../deps/scene-browser';
 
 type Set = (
   partial:
@@ -21,6 +22,11 @@ function releaseDeletedMediaResources(ids: string[]): void {
   for (const id of ids) {
     blobUrlManager.release(id);
     proxyService.clearProxyKey(id);
+    // Drop every Scene Browser cache tied to this media — thumbnail blob
+    // URLs (which otherwise pin the JPEG in memory forever), lazy-thumb
+    // result memos, and both text + image embedding maps. Disk-side
+    // cleanup is already handled by the recursive `media/{id}/` removal.
+    invalidateMediaCaptionThumbnails(id);
   }
 }
 
diff --git a/src/features/media-library/stores/media-library-store.ts b/src/features/media-library/stores/media-library-store.ts
index 4e086d922..b192449e0 100644
--- a/src/features/media-library/stores/media-library-store.ts
+++ b/src/features/media-library/stores/media-library-store.ts
@@ -1,4 +1,4 @@
-import { create } from 'zustand';
+import { create, type StoreApi, type UseBoundStore } from 'zustand';
 import { devtools } from 'zustand/middleware';
 import type {
   MediaLibraryState,
@@ -74,7 +74,16 @@ async function initializeProxyState(mediaItems: MediaMetadata[]): Promise<void> {
   await proxyService.loadExistingProxies(videoItems.map((item) => item.id));
 }
 
-export const useMediaLibraryStore = create<
+type MediaLibraryStoreApi = UseBoundStore<StoreApi<MediaLibraryState & MediaLibraryActions>>;
+
+declare global {
+  // eslint-disable-next-line no-var
+  var __FREECUT_MEDIA_LIBRARY_STORE__: MediaLibraryStoreApi | undefined;
+}
+
+const hotStore = import.meta.env.DEV ? globalThis.__FREECUT_MEDIA_LIBRARY_STORE__ : undefined;
+
+const newStore: MediaLibraryStoreApi = hotStore ??
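+  // first execution builds a fresh store; an HMR re-execution reuses the
+  // live instance stashed on globalThis below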
create<
   MediaLibraryState & MediaLibraryActions
 >()(
   devtools(
@@ -120,6 +129,7 @@
 
       // AI tagging
       taggingMediaIds: new Set(),
+      analysisProgress: null,
 
       // v3: Set current project context
       setCurrentProject: (projectId: string | null) => {
@@ -141,6 +151,7 @@
         transcriptStatus: new Map(),
         transcriptProgress: new Map(),
         taggingMediaIds: new Set(),
+        analysisProgress: null,
       });
       // Note: loadMediaItems is triggered by the component's useEffect
       // Don't call it here to avoid double loading
@@ -361,6 +372,54 @@
         return { mediaItems };
       });
     },
+
+    beginAnalysisRun: (count) => {
+      if (count <= 0) return;
+      set((state) => {
+        const current = state.analysisProgress;
+        if (!current) {
+          return { analysisProgress: { total: count, completed: 0, cancelRequested: false } };
+        }
+        // Merge concurrent runs (e.g. a per-card analyze while a batch is
+        // in flight) by growing the total so the bar keeps tracking one
+        // combined run toward 100% instead of snapping back to zero.
+        return {
+          analysisProgress: {
+            total: current.total + count,
+            completed: current.completed,
+            cancelRequested: current.cancelRequested,
+          },
+        };
+      });
+    },
+
+    incrementAnalysisCompleted: (n = 1) => {
+      set((state) => {
+        if (!state.analysisProgress) return state;
+        return {
+          analysisProgress: {
+            ...state.analysisProgress,
+            completed: Math.min(
+              state.analysisProgress.total,
+              state.analysisProgress.completed + n,
+            ),
+          },
+        };
+      });
+    },
+
+    requestAnalysisCancel: () => {
+      set((state) => {
+        if (!state.analysisProgress) return state;
+        return {
+          analysisProgress: { ...state.analysisProgress, cancelRequested: true },
+        };
+      });
+    },
+
+    endAnalysisRun: () => {
+      set({ analysisProgress: null });
+    },
   }),
   {
     name: 'MediaLibraryStore',
@@ -369,28 +428,46 @@
   )
 );
 
-// Keep mediaById synchronized even when action modules update mediaItems directly.
-let prevMediaItemsRef = useMediaLibraryStore.getState().mediaItems;
-useMediaLibraryStore.subscribe((state) => {
-  if (state.mediaItems === prevMediaItemsRef) {
-    return;
-  }
-  prevMediaItemsRef = state.mediaItems;
-  useMediaLibraryStore.setState({ mediaById: buildMediaById(state.mediaItems) });
-});
-
-// Wire up proxy service status listener to update store state
-proxyService.onStatusChange((mediaId, status, progress) => {
-  const store = useMediaLibraryStore.getState();
-  if (status === 'idle') {
-    store.clearProxyStatus(mediaId);
-    return;
-  }
-  store.setProxyStatus(mediaId, status);
-  if (progress !== undefined) {
-    store.setProxyProgress(mediaId, progress);
-  }
-});
+// Preserve the store instance across Vite HMR so that mediaItems and the
+// rest of project state don't reset to `[]` on every file save — without
+// this, editing a feature component wipes the scene browser's "X clips · Y
+// scenes" and requires a hard refresh to reload via `loadMediaItems`.
+// DEV-only: prod builds don't HMR so the cache is harmless to skip.
+if (import.meta.env.DEV) {
+  globalThis.__FREECUT_MEDIA_LIBRARY_STORE__ = newStore;
+}
+
+export const useMediaLibraryStore = newStore;
+
+// Subscriptions (below) must only be wired the first time the store is
+// created. On HMR, `hotStore` is non-null and the subscription from the
+// previous module execution is still live on the store — re-wiring here
+// would leak a listener on every file save, eventually double-updating
+// `mediaById` and double-firing proxy status changes.
+if (!hotStore) {
+  // Keep mediaById synchronized even when action modules update mediaItems directly.
+  let prevMediaItemsRef = useMediaLibraryStore.getState().mediaItems;
+  useMediaLibraryStore.subscribe((state) => {
+    if (state.mediaItems === prevMediaItemsRef) {
+      return;
+    }
+    prevMediaItemsRef = state.mediaItems;
+    useMediaLibraryStore.setState({ mediaById: buildMediaById(state.mediaItems) });
+  });
+
+  // Wire up proxy service status listener to update store state
+  proxyService.onStatusChange((mediaId, status, progress) => {
+    const store = useMediaLibraryStore.getState();
+    if (status === 'idle') {
+      store.clearProxyStatus(mediaId);
+      return;
+    }
+    store.setProxyStatus(mediaId, status);
+    if (progress !== undefined) {
+      store.setProxyProgress(mediaId, progress);
+    }
+  });
+}
 
 // Selector hooks for common use cases (optional, but recommended)
 export const useFilteredMediaItems = () => {
diff --git a/src/features/media-library/transcription/browser-transcriber.ts b/src/features/media-library/transcription/browser-transcriber.ts
index 2367eabe4..b81920bf0 100644
--- a/src/features/media-library/transcription/browser-transcriber.ts
+++ b/src/features/media-library/transcription/browser-transcriber.ts
@@ -10,6 +10,8 @@ import type { MediaTranscriptQuantization } from '@/types/storage';
 import { localInferenceRuntimeRegistry } from '@/shared/state/local-inference';
 import { LOCAL_INFERENCE_UNLOADED_MESSAGE } from '@/shared/state/local-inference';
 import { formatWhisperRuntimeModelLabel, estimateWhisperRuntimeBytes } from './runtime-estimates';
+import { DEFAULT_WHISPER_MODEL } from '@/shared/utils/whisper-settings';
+import { usePlaybackStore } from '@/shared/state/playback';
 
 export class BrowserTranscriber {
   private readonly defaultOptions: TranscribeOptions;
@@ -37,6 +39,9 @@ export class TranscribeStream implements AsyncIterable<TranscriptSegment> {
   private bridge: Bridge | null = null;
   private started = false;
   private runtimeRegistered = false;
+  private unsubscribePlayback: (() => void) | null = null;
+  private idleResumeTimer: ReturnType<typeof setTimeout> | null = null;
+  private workerPaused = false;
 
   constructor(file: File, options: TranscribeOptions = {}) {
     this.file = file;
@@ -77,9 +82,70 @@ export class TranscribeStream implements AsyncIterable<TranscriptSegment> {
     this.queue.length = 0;
     this.error = new Error(message);
     this.unregisterRuntime();
+    this.stopPlaybackWatcher();
     this.wakeUp();
   }
 
+  private startPlaybackWatcher(): void {
+    if (this.unsubscribePlayback) return;
+    if (!this.bridge) return;
+
+    const IDLE_RESUME_MS = 400;
+
+    const pauseWorker = () => {
+      if (this.idleResumeTimer !== null) {
+        clearTimeout(this.idleResumeTimer);
+        this.idleResumeTimer = null;
+      }
+      if (this.workerPaused) return;
+      this.workerPaused = true;
+      this.bridge?.setPaused(true);
+    };
+
+    const scheduleResume = () => {
+      if (this.idleResumeTimer !== null) {
+        clearTimeout(this.idleResumeTimer);
+      }
+      this.idleResumeTimer = setTimeout(() => {
+        this.idleResumeTimer = null;
+        const playback = usePlaybackStore.getState();
+        if (playback.isPlaying || playback.previewFrame !== null) return;
+        this.workerPaused = false;
+        this.bridge?.setPaused(false);
+      }, IDLE_RESUME_MS);
+    };
+
+    const initial = usePlaybackStore.getState();
+    if (initial.isPlaying || initial.previewFrame !== null) {
+      pauseWorker();
+    }
+
+    this.unsubscribePlayback = usePlaybackStore.subscribe((state, prev) => {
+      const isActive = state.isPlaying || state.previewFrame !== null;
+      const frameMoved = state.currentFrameEpoch !== prev.currentFrameEpoch;
+
+      if (isActive || frameMoved) {
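+        // playback or a scrub step pauses the whisper/decoder workers right
+        // away; resume is debounced in scheduleResume so rapid scrubbing
+        // doesn't thrash the pipeline with pause/resume churn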
pauseWorker();
+        if (!state.isPlaying) scheduleResume();
+        return;
+      }
+
+      if (prev.isPlaying && !state.isPlaying) {
+        scheduleResume();
+      }
+    });
+  }
+
+  private stopPlaybackWatcher(): void {
+    this.unsubscribePlayback?.();
+    this.unsubscribePlayback = null;
+    if (this.idleResumeTimer !== null) {
+      clearTimeout(this.idleResumeTimer);
+      this.idleResumeTimer = null;
+    }
+    this.workerPaused = false;
+  }
+
   private async startBridge(): Promise<void> {
     if (this.started) {
       return;
     }
@@ -104,11 +170,13 @@
       onDone: () => {
         this.doneFlag = true;
         this.unregisterRuntime();
+        this.stopPlaybackWatcher();
         this.wakeUp();
       },
       onError: (message: string) => {
         this.error = new Error(message);
         this.unregisterRuntime();
+        this.stopPlaybackWatcher();
        this.wakeUp();
       },
     });
@@ -116,13 +184,15 @@
     try {
       await this.bridge.start(
         this.file,
-        (this.options.model as WhisperModel | undefined) ?? 'whisper-tiny',
+        (this.options.model as WhisperModel | undefined) ?? DEFAULT_WHISPER_MODEL,
         this.options.language,
         this.options.quantization,
       );
+      this.startPlaybackWatcher();
     } catch (error) {
       this.error = error instanceof Error ? error : new Error(String(error));
       this.unregisterRuntime();
+      this.stopPlaybackWatcher();
       this.wakeUp();
     }
   }
@@ -133,7 +203,7 @@
     }
     this.runtimeRegistered = true;
 
-    const model = (this.options.model as WhisperModel | undefined) ?? 'whisper-tiny';
+    const model = (this.options.model as WhisperModel | undefined) ?? DEFAULT_WHISPER_MODEL;
     const quantization = (this.options.quantization as MediaTranscriptQuantization | undefined) ?? 'hybrid';
 
     const now = Date.now();
diff --git a/src/features/media-library/transcription/lib/bridge.ts b/src/features/media-library/transcription/lib/bridge.ts
index 849c9a969..6d49e1c70 100644
--- a/src/features/media-library/transcription/lib/bridge.ts
+++ b/src/features/media-library/transcription/lib/bridge.ts
@@ -11,6 +11,7 @@ import { MODEL_IDS } from '../types';
 import { createManagedWorkerSession } from '@/shared/utils/managed-worker-session';
 import { Chunker } from './chunker';
 import { downmixToMono, resampleTo16kHz } from './resampler';
+import { DEFAULT_WHISPER_MODEL } from '@/shared/utils/whisper-settings';
 
 export interface BridgeCallbacks {
   onSegment: (segment: TranscriptSegment) => void;
@@ -73,7 +74,7 @@ export class Bridge {
 
   async start(
     file: File,
-    model: WhisperModel = 'whisper-tiny',
+    model: WhisperModel = DEFAULT_WHISPER_MODEL,
     language?: string,
     quantization?: QuantizationType,
   ): Promise<void> {
@@ -112,6 +113,19 @@
     this.session.terminate();
   }
 
+  setPaused(paused: boolean): void {
+    if (this.session.isTerminated()) {
+      return;
+    }
+
+    const message = { type: paused ?
'pause' : 'resume' } as const;
+    this.session.getWorker('whisper').postMessage(message);
+    const hasWebCodecs = typeof window !== 'undefined' && 'AudioDecoder' in window;
+    if (hasWebCodecs) {
+      this.session.getWorker('decoder').postMessage(message);
+    }
+  }
+
   private async decodeWithAudioContext(file: File, port: MessagePort): Promise<void> {
     try {
       this.callbacks.onProgress({ stage: 'decoding', progress: 0 });
diff --git a/src/features/media-library/transcription/registry.test.ts b/src/features/media-library/transcription/registry.test.ts
index c58f4d678..c053ba27f 100644
--- a/src/features/media-library/transcription/registry.test.ts
+++ b/src/features/media-library/transcription/registry.test.ts
@@ -12,11 +12,15 @@ describe('mediaTranscriptionAdapterRegistry', () => {
       id: 'browser-whisper',
       label: 'Browser Whisper',
     });
-    expect(getDefaultMediaTranscriptionModel()).toBe('whisper-tiny');
+    expect(getDefaultMediaTranscriptionModel()).toBe('whisper-small');
     expect(getMediaTranscriptionModelOptions()).toContainEqual({
       value: 'whisper-small',
       label: 'Small',
     });
+    expect(getMediaTranscriptionModelOptions()).not.toContainEqual({
+      value: 'whisper-tiny',
+      label: 'Tiny',
+    });
   });
 
   it('formats model labels through the active adapter', () => {
diff --git a/src/features/media-library/transcription/types.ts b/src/features/media-library/transcription/types.ts
index 8c889d652..d6e9cecc2 100644
--- a/src/features/media-library/transcription/types.ts
+++ b/src/features/media-library/transcription/types.ts
@@ -49,7 +49,9 @@ export type WhisperWorkerMessage =
       modelId: string;
       language?: string;
       quantization?: QuantizationType;
-    };
+    }
+  | { type: 'pause' }
+  | { type: 'resume' };
 
 export const MODEL_IDS: Record<WhisperModel, string> = {
   'whisper-tiny': 'onnx-community/whisper-tiny',
diff --git a/src/features/media-library/transcription/workers/decoder.worker.ts b/src/features/media-library/transcription/workers/decoder.worker.ts
index 60f53e9f9..6a10f248d 100644
--- a/src/features/media-library/transcription/workers/decoder.worker.ts
+++ b/src/features/media-library/transcription/workers/decoder.worker.ts
@@ -11,6 +11,8 @@ import type { MainThreadMessage, PCMChunk } from '../types';
 let port: MessagePort | null = null;
 let whisperQueueSize = 0;
 let whisperQueueWaiter: (() => void) | null = null;
+let paused = false;
+let pauseWaiter: (() => void) | null = null;
 
 self.onmessage = async (event: MessageEvent) => {
   const message = event.data as { type: string; port?: MessagePort; file?: File };
@@ -27,6 +29,21 @@
     return;
   }
 
+  if (message.type === 'pause') {
+    paused = true;
+    return;
+  }
+
+  if (message.type === 'resume') {
+    paused = false;
+    if (pauseWaiter) {
+      const waiter = pauseWaiter;
+      pauseWaiter = null;
+      waiter();
+    }
+    return;
+  }
+
   if (message.type === 'init' && message.file) {
     try {
       await run(message.file);
@@ -39,6 +56,13 @@
   }
 };
 
+function awaitResume(): Promise<void> {
+  if (!paused) return Promise.resolve();
+  return new Promise((resolve) => {
+    pauseWaiter = resolve;
+  });
+}
+
 async function run(file: File): Promise<void> {
   if (typeof AudioDecoder === 'undefined') {
     throw new Error('WebCodecs AudioDecoder is not available in this browser');
   }
@@ -146,6 +170,10 @@
   try {
     const sink = new EncodedPacketSink(audioTrack);
     for await (const packet of sink.packets()) {
+      if (paused) {
+        await awaitResume();
+      }
+
       while (decoder.decodeQueueSize > 10 || whisperQueueSize >= 3) {
         await new
Promise((resolve) => {
           if (decoder.decodeQueueSize > 10) {
diff --git a/src/features/media-library/transcription/workers/whisper.worker.ts b/src/features/media-library/transcription/workers/whisper.worker.ts
index d0917be50..f64854339 100644
--- a/src/features/media-library/transcription/workers/whisper.worker.ts
+++ b/src/features/media-library/transcription/workers/whisper.worker.ts
@@ -48,10 +48,29 @@ let currentModelId: string | null = null;
 let port: MessagePort | null = null;
 let language: string | undefined;
 let pipelineReady = false;
+let paused = false;
 const queue: PCMChunk[] = [];
 let processing = false;
 let reportedEstimatedBytes = 0;
 
+self.addEventListener('unhandledrejection', (event: PromiseRejectionEvent) => {
+  const reason = event.reason;
+  const message = reason instanceof Error
+    ? `${reason.name}: ${reason.message}`
+    : typeof reason === 'string'
+      ? reason
+      : 'Unknown worker error';
+  postMain({ type: 'error', message });
+  event.preventDefault();
+});
+
+self.addEventListener('error', (event: ErrorEvent) => {
+  postMain({
+    type: 'error',
+    message: event.message || (event.error instanceof Error ? event.error.message : 'Worker error'),
+  });
+});
+
 self.onmessage = async (event: MessageEvent) => {
   const message = event.data as WhisperWorkerMessage;
 
@@ -66,18 +85,35 @@
   if (message.type === 'init') {
     language = message.language;
     await initPipeline(message.modelId, message.quantization ?? 'hybrid');
+    return;
+  }
+
+  if (message.type === 'pause') {
+    paused = true;
+    return;
+  }
+
+  if (message.type === 'resume') {
+    if (!paused) return;
+    paused = false;
+    if (pipelineReady && !processing && queue.length > 0) {
+      void processNext();
+    }
   }
 };
 
 function enqueue(chunk: PCMChunk): void {
   queue.push(chunk);
   port?.postMessage(queue.length);
-  if (pipelineReady && !processing) {
+  if (pipelineReady && !processing && !paused) {
     void processNext();
   }
 }
 
-async function initPipeline(modelId: string, quantization: QuantizationType): Promise<void> {
+async function initPipeline(
+  modelId: string,
+  quantization: QuantizationType,
+): Promise<void> {
   postMain({ type: 'progress', event: { stage: 'loading', progress: 0 } });
   reportedEstimatedBytes = 0;
@@ -189,7 +225,7 @@
 }
 
 async function processNext(): Promise<void> {
-  if (!pipelineReady || !asrPipeline) {
+  if (!pipelineReady || !asrPipeline || paused) {
     processing = false;
     return;
   }
@@ -215,7 +251,7 @@
   }
 
   processing = false;
-  if (queue.length > 0) {
+  if (queue.length > 0 && !paused) {
     void processNext();
   }
 }
diff --git a/src/features/media-library/types.ts b/src/features/media-library/types.ts
index b5bc177d5..a3e6ca76b 100644
--- a/src/features/media-library/types.ts
+++ b/src/features/media-library/types.ts
@@ -11,7 +11,7 @@ export interface MediaLibrarySelection {
   compositionIds: string[];
 }
 
-export type MediaTranscriptStatus = 'idle' | 'transcribing' | 'ready' | 'error';
+export type MediaTranscriptStatus = 'idle' | 'queued' | 'transcribing' | 'ready' | 'error';
 
 export type MediaTranscriptProgress = TranscriptionProgressSnapshot;
 
@@ -94,6 +94,19 @@ export interface MediaLibraryState {
 
   // AI tagging
   taggingMediaIds: Set<string>;
 
+  /**
+   * Deterministic progress for the currently running AI analysis run (single
+   * item or batch). Null when no analysis is in flight.
`completed` counts + * finished items (success or failure); the background progress bar reads + * `completed / total` to draw a real percentage instead of an indeterminate + * pulse. `cancelRequested` is a soft stop — the service finishes the + * current item then skips the rest. + */ + analysisProgress: { + total: number; + completed: number; + cancelRequested: boolean; + } | null; } export interface MediaLibraryActions { @@ -185,5 +198,14 @@ export interface MediaLibraryActions { // AI captioning setTaggingMedia: (mediaId: string, active: boolean) => void; - updateMediaCaptions: (mediaId: string, captions: Array<{ timeSec: number; text: string }>) => void; + updateMediaCaptions: (mediaId: string, captions: NonNullable) => void; + + /** Start (or merge into) an analysis run — adds `count` to `total`. */ + beginAnalysisRun: (count: number) => void; + /** Increment the completed counter by one (or by `n`). */ + incrementAnalysisCompleted: (n?: number) => void; + /** Ask the current run to stop after the in-flight item. */ + requestAnalysisCancel: () => void; + /** Clear analysisProgress when the run is done. */ + endAnalysisRun: () => void; } diff --git a/src/features/media-library/utils/caption-items.test.ts b/src/features/media-library/utils/caption-items.test.ts index a10eb4249..e80e044a5 100644 --- a/src/features/media-library/utils/caption-items.test.ts +++ b/src/features/media-library/utils/caption-items.test.ts @@ -8,18 +8,80 @@ vi.mock('../deps/timeline-contract', () => ({ timelineFps: number, sourceFps: number, ) => Math.max(0, Math.round((timelineFrames / timelineFps) * sourceFps * speed)), + getNextClassicTrackName: (tracks: Array<{ name: string; kind?: string }>, kind: 'video' | 'audio') => { + const prefix = kind === 'video' ? 'V' : 'A'; + const regex = new RegExp(`^${prefix}(\\d+)$`, 'i'); + const used = new Set( + tracks + .filter((track) => track.kind === undefined || track.kind === kind) + .map((track) => { + const match = track.name.match(regex); + return match?.[1] ? Number.parseInt(match[1], 10) : NaN; + }) + .filter((value) => Number.isFinite(value) && value > 0), + ); + let next = 1; + while (used.has(next)) next += 1; + return `${prefix}${next}`; + }, + getTrackKind: (track: { name: string; kind?: string }) => { + if (track.kind === 'video' || track.kind === 'audio') { + return track.kind; + } + if (/^V(\d+)$/i.test(track.name)) { + return 'video'; + } + if (/^A(\d+)$/i.test(track.name)) { + return 'audio'; + } + return null; + }, + getEffectiveTrackKindForItem: ( + track: { id: string; name: string; kind?: string }, + items: Array<{ trackId: string; type: string }>, + ) => { + if (track.kind === 'video' || track.kind === 'audio') { + return track.kind; + } + if (/^V(\d+)$/i.test(track.name)) { + return 'video'; + } + if (/^A(\d+)$/i.test(track.name)) { + return 'audio'; + } + + let hasAudioItems = false; + for (const item of items) { + if (item.trackId !== track.id) continue; + if (item.type === 'audio') { + hasAudioItems = true; + continue; + } + return 'video'; + } + + return hasAudioItems ? 
'audio' : null; + }, })); import { + aiCaptionsToSegments, buildCaptionTextItems, + buildCaptionTrack, + buildCaptionTrackAbove, findGeneratedCaptionItemsForClip, findReplaceableCaptionItemsForClip, getCaptionTextItemTemplate, findCompatibleCaptionTrack, + findCompatibleCaptionTrackForRanges, + findCompatibleGeneratedTrackForRanges, getCaptionRangeForClip, getCaptionFrameRange, + isGeneratedContentTrackCandidate, + isCaptionTrackCandidate, normalizeCaptionSegments, } from './caption-items'; +import { getTrackKind } from '../deps/timeline-contract'; import type { TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline'; describe('caption-items', () => { @@ -158,6 +220,105 @@ describe('caption-items', () => { expect(track?.id).toBe('track-2'); }); + it('never reuses audio tracks for caption text', () => { + const tracks: TimelineTrack[] = [ + { + id: 'track-audio', + name: 'A1', + kind: 'audio', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 0, + items: [], + }, + { + id: 'track-video', + name: 'V1', + kind: 'video', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 1, + items: [], + }, + ]; + + expect(isCaptionTrackCandidate(tracks[0]!, [])).toBe(false); + expect(isCaptionTrackCandidate(tracks[1]!, [])).toBe(true); + expect(findCompatibleCaptionTrack(tracks, [], 30, 90)?.id).toBe('track-video'); + expect( + findCompatibleCaptionTrackForRanges(tracks, [], [{ startFrame: 30, endFrame: 90 }])?.id, + ).toBe('track-video'); + }); + + it('can target audio tracks for generated audio content', () => { + const tracks: TimelineTrack[] = [ + { + id: 'track-generic-audio', + name: 'Track 1', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 0, + items: [], + }, + { + id: 'track-video', + name: 'V1', + kind: 'video', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 1, + items: [], + }, + { + id: 'track-audio', + name: 'A1', + kind: 'audio', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 2, + items: [], + }, + ]; + const items: TimelineItem[] = [ + { + id: 'existing-audio', + type: 'audio', + trackId: 'track-generic-audio', + from: 0, + durationInFrames: 30, + label: 'Existing audio', + src: 'blob:test', + }, + ]; + + expect(isGeneratedContentTrackCandidate(tracks[0]!, items, 'audio')).toBe(true); + expect(isGeneratedContentTrackCandidate(tracks[1]!, items, 'audio')).toBe(false); + expect(isGeneratedContentTrackCandidate(tracks[2]!, items, 'audio')).toBe(true); + expect( + findCompatibleGeneratedTrackForRanges( + tracks, + items, + [{ startFrame: 30, endFrame: 90 }], + 'audio', + )?.id, + ).toBe('track-generic-audio'); + }); + it('returns the overall transcript frame range', () => { const frameRange = getCaptionFrameRange( [ @@ -294,3 +455,95 @@ describe('caption-items', () => { expect(replaceableCaptions.map((item) => item.id)).toEqual(['legacy-caption']); }); }); + +function makeTrack(id: string, order: number): TimelineTrack { + return { + id, + name: id, + height: 40, + locked: false, + visible: true, + muted: false, + solo: false, + order, + items: [], + }; +} + +describe('aiCaptionsToSegments', () => { + it('returns [] for empty input', () => { + expect(aiCaptionsToSegments([])).toEqual([]); + }); + + it('derives end from next caption start for all but the last', () => { + const segments = aiCaptionsToSegments([ + { timeSec: 0, text: 'a' }, + { timeSec: 3, text: 'b' }, + { timeSec: 7, text: 'c' }, 
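+      // point-in-time captions: each segment's end is derived from the next caption's start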
+ ]); + expect(segments).toEqual([ + { text: 'a', start: 0, end: 3 }, + { text: 'b', start: 3, end: 7 }, + { text: 'c', start: 7, end: 10 }, // 3s fallback for the tail + ]); + }); + + it('uses provided sampleIntervalSec for the trailing caption', () => { + const segments = aiCaptionsToSegments([{ timeSec: 0, text: 'only' }], 5); + expect(segments).toEqual([{ text: 'only', start: 0, end: 5 }]); + }); + + it('sorts captions by timeSec before converting', () => { + const segments = aiCaptionsToSegments([ + { timeSec: 5, text: 'b' }, + { timeSec: 0, text: 'a' }, + ]); + expect(segments.map((s) => s.text)).toEqual(['a', 'b']); + }); +}); + +describe('buildCaptionTrackAbove', () => { + it('places the caption track halfway between the reference and the next track up', () => { + const tracks = [makeTrack('a', 0), makeTrack('b', 1), makeTrack('c', 2)]; + const captionTrack = buildCaptionTrackAbove(tracks, 2); + expect(captionTrack.order).toBe(1.5); + }); + + it('places the track a full integer above when nothing sits higher', () => { + const tracks = [makeTrack('a', 5)]; + const captionTrack = buildCaptionTrackAbove(tracks, 5); + expect(captionTrack.order).toBe(4); + }); + + it('sorts visually higher than the reference clip track after insertion', () => { + const tracks = [makeTrack('a', 0), makeTrack('clip', 1), makeTrack('b', 2)]; + const captionTrack = buildCaptionTrackAbove(tracks, 1); + const sorted = [...tracks, captionTrack].sort((x, y) => x.order - y.order); + const clipIndex = sorted.findIndex((t) => t.id === 'clip'); + const captionIndex = sorted.findIndex((t) => t.id === captionTrack.id); + // CLAUDE.md convention: lower order = visually higher (top of timeline). + expect(captionIndex).toBeLessThan(clipIndex); + }); + + it('creates a video-kind overlay track so the timeline renders it immediately', () => { + const tracks = [makeTrack('clip', 1)]; + const captionTrack = buildCaptionTrackAbove(tracks, 1); + expect(captionTrack.kind).toBe('video'); + expect(getTrackKind(captionTrack)).toBe('video'); + expect(captionTrack.name).toBe('V1'); + }); +}); + +describe('buildCaptionTrack (append-to-bottom helper)', () => { + it('still creates tracks at maxOrder + 1', () => { + const tracks = [ + { ...makeTrack('a', 0), name: 'V1', kind: 'video' as const }, + { ...makeTrack('b', 1), name: 'A1', kind: 'audio' as const }, + { ...makeTrack('c', 2), name: 'V2', kind: 'video' as const }, + ]; + const captionTrack = buildCaptionTrack(tracks); + expect(captionTrack.order).toBe(3); + expect(captionTrack.kind).toBe('video'); + expect(captionTrack.name).toBe('V3'); + }); +}); diff --git a/src/features/media-library/utils/caption-items.ts b/src/features/media-library/utils/caption-items.ts index bea59355a..eafba1fec 100644 --- a/src/features/media-library/utils/caption-items.ts +++ b/src/features/media-library/utils/caption-items.ts @@ -1,5 +1,11 @@ -import { DEFAULT_TRACK_HEIGHT } from '../deps/timeline-contract'; +import { + DEFAULT_TRACK_HEIGHT, + getEffectiveTrackKindForItem, + getNextClassicTrackName, + type TrackKind, +} from '../deps/timeline-contract'; import type { MediaTranscriptSegment } from '@/types/storage'; +import type { MediaCaption } from '@/infrastructure/analysis'; import type { AudioItem, GeneratedCaptionSource, @@ -10,6 +16,13 @@ import type { } from '@/types/timeline'; import { timelineToSourceFrames } from '../deps/timeline-contract'; +/** + * Fallback segment duration when AI captions can't infer an `end` time from + * the next caption (i.e. 
for the last caption, or when the sample interval is + * unknown). Seconds. + */ +const AI_CAPTION_FALLBACK_DURATION_SEC = 3; + interface BuildCaptionTextItemsOptions { mediaId: string; trackId: string; @@ -19,6 +32,13 @@ interface BuildCaptionTextItemsOptions { canvasWidth: number; canvasHeight: number; styleTemplate?: CaptionTextItemTemplate; + /** + * Discriminator for the `captionSource.type` stamped on the generated + * text items. Defaults to `'transcript'` (whisper flow); AI captioning + * flows pass `'ai-captions'` so later replace/remove operations can tell + * the two kinds apart on the same clip. + */ + sourceType?: GeneratedCaptionSource['type']; } export type CaptionTextItemTemplate = Pick< @@ -198,39 +218,32 @@ export function findCompatibleCaptionTrack( startFrame: number, endFrame: number, ): TimelineTrack | null { - const sortedTracks = [...tracks].sort((a, b) => a.order - b.order); - - for (const track of sortedTracks) { - if (track.visible === false || track.locked || track.isGroup) { - continue; - } - - const hasOverlap = items.some((item) => { - if (item.trackId !== track.id) { - return false; - } - - const itemEnd = item.from + item.durationInFrames; - return item.from < endFrame && itemEnd > startFrame; - }); - - if (!hasOverlap) { - return track; - } - } - - return null; + return findCompatibleGeneratedTrackForRanges( + tracks, + items, + [{ startFrame, endFrame }], + 'video', + ); } export function findCompatibleCaptionTrackForRanges( tracks: readonly TimelineTrack[], items: readonly TimelineItem[], ranges: ReadonlyArray<{ startFrame: number; endFrame: number }>, +): TimelineTrack | null { + return findCompatibleGeneratedTrackForRanges(tracks, items, ranges, 'video'); +} + +export function findCompatibleGeneratedTrackForRanges( + tracks: readonly TimelineTrack[], + items: readonly TimelineItem[], + ranges: ReadonlyArray<{ startFrame: number; endFrame: number }>, + requiredKind: TrackKind, ): TimelineTrack | null { const sortedTracks = [...tracks].sort((a, b) => a.order - b.order); for (const track of sortedTracks) { - if (track.visible === false || track.locked || track.isGroup) { + if (!isGeneratedContentTrackCandidate(track, items, requiredKind)) { continue; } @@ -253,34 +266,135 @@ export function findCompatibleCaptionTrackForRanges( return null; } +export function isGeneratedContentTrackCandidate( + track: TimelineTrack, + items: readonly TimelineItem[], + requiredKind: TrackKind, +): boolean { + if (track.visible === false || track.locked || track.isGroup) { + return false; + } + + const effectiveKind = getEffectiveTrackKindForItem(track, items); + if (requiredKind === 'audio') { + return effectiveKind === 'audio'; + } + + return effectiveKind === 'video' || effectiveKind === null; +} + +export function isCaptionTrackCandidate( + track: TimelineTrack, + items: readonly TimelineItem[], +): boolean { + return isGeneratedContentTrackCandidate(track, items, 'video'); +} + export function buildCaptionTrack(tracks: readonly TimelineTrack[]): TimelineTrack { const maxOrder = tracks.reduce((highest, track) => Math.max(highest, track.order), -1); return { id: `track-captions-${Date.now()}`, - name: 'Captions', + name: getNextClassicTrackName([...tracks], 'video'), + kind: 'video', height: DEFAULT_TRACK_HEIGHT, locked: false, + syncLock: true, visible: true, muted: false, solo: false, + volume: 0, order: maxOrder + 1, items: [], }; } -function buildCaptionSource(mediaId: string, clipId: string): GeneratedCaptionSource { +/** + * Build a captions track positioned *above* 
a reference track (the clip's + * own track in the AI-captions flow). The new track's `order` is set halfway + * between `referenceOrder` and the next track up, so both stay unique and no + * existing tracks need to shift. + * + * If nothing sits above the reference, we land a full integer lower than it. + * Matches the fractional-order pattern used by `insertTrack` in + * `use-timeline-tracks.ts`. + */ +export function buildCaptionTrackAbove( + tracks: readonly TimelineTrack[], + referenceOrder: number, +): TimelineTrack { + const ordersStrictlyAbove = tracks + .map((t) => t.order) + .filter((order) => order < referenceOrder); + const previousOrder = ordersStrictlyAbove.length > 0 + ? Math.max(...ordersStrictlyAbove) + : referenceOrder - 2; + const newOrder = (previousOrder + referenceOrder) / 2; + + return { + id: `track-captions-${Date.now()}`, + name: getNextClassicTrackName([...tracks], 'video'), + kind: 'video', + height: DEFAULT_TRACK_HEIGHT, + locked: false, + syncLock: true, + visible: true, + muted: false, + solo: false, + volume: 0, + order: newOrder, + items: [], + }; +} + +function buildCaptionSource( + mediaId: string, + clipId: string, + type: GeneratedCaptionSource['type'] = 'transcript', +): GeneratedCaptionSource { return { - type: 'transcript', + type, mediaId, clipId, }; } +/** + * Convert AI captions (point-in-time frame descriptions) into segments with + * start/end pairs consumable by {@link buildCaptionTextItems}. + * + * AI captions have no intrinsic duration — the end time is derived from the + * next caption's `timeSec`, with a fallback to the provider's sample interval + * (or {@link AI_CAPTION_FALLBACK_DURATION_SEC}) for the final caption. + */ +export function aiCaptionsToSegments( + captions: readonly MediaCaption[], + sampleIntervalSec?: number, +): MediaTranscriptSegment[] { + if (captions.length === 0) return []; + const sorted = [...captions].sort((a, b) => a.timeSec - b.timeSec); + const fallbackEndDelta = sampleIntervalSec && sampleIntervalSec > 0 + ? sampleIntervalSec + : AI_CAPTION_FALLBACK_DURATION_SEC; + + return sorted.map((caption, index) => { + const next = sorted[index + 1]; + const start = Math.max(0, caption.timeSec); + const end = next !== undefined + ? 
Math.max(start + 0.01, next.timeSec) + : start + fallbackEndDelta; + return { + text: caption.text, + start, + end, + }; + }); +} + export function isGeneratedCaptionTextItem( item: TimelineItem, ): item is TextItem & { captionSource: GeneratedCaptionSource } { return item.type === 'text' - && item.captionSource?.type === 'transcript' + && (item.captionSource?.type === 'transcript' || item.captionSource?.type === 'ai-captions') && item.captionSource.clipId.length > 0 && item.captionSource.mediaId.length > 0; } @@ -288,9 +402,12 @@ export function isGeneratedCaptionTextItem( export function findGeneratedCaptionItemsForClip( items: readonly TimelineItem[], clipId: string, + sourceType?: GeneratedCaptionSource['type'], ): Array { return items.filter((item): item is TextItem & { captionSource: GeneratedCaptionSource } => - isGeneratedCaptionTextItem(item) && item.captionSource.clipId === clipId + isGeneratedCaptionTextItem(item) + && item.captionSource.clipId === clipId + && (sourceType === undefined || item.captionSource.type === sourceType) ); } @@ -313,12 +430,18 @@ function isLegacyGeneratedCaptionItemForClip( export function findReplaceableCaptionItemsForClip( items: readonly TimelineItem[], clip: AudioItem | VideoItem, + sourceType?: GeneratedCaptionSource['type'], ): TextItem[] { - const generatedCaptionItems = findGeneratedCaptionItemsForClip(items, clip.id); + const generatedCaptionItems = findGeneratedCaptionItemsForClip(items, clip.id, sourceType); if (generatedCaptionItems.length > 0) { return generatedCaptionItems; } + // Legacy fallback only applies to transcript-generated captions (the only + // kind that predates the `captionSource` discriminator). + if (sourceType !== undefined && sourceType !== 'transcript') { + return []; + } return items.filter((item): item is TextItem => isLegacyGeneratedCaptionItemForClip(item, clip)); } @@ -352,6 +475,7 @@ export function buildCaptionTextItems({ canvasWidth, canvasHeight, styleTemplate, + sourceType = 'transcript', }: BuildCaptionTextItemsOptions): TextItem[] { const normalizedSegments = normalizeCaptionSegments(segments); const { sourceStart, sourceEnd, sourceFps, speed } = getClipSourceBounds(clip, timelineFps); @@ -387,11 +511,12 @@ export function buildCaptionTextItems({ const defaultCaptionItem: TextItem = { id: crypto.randomUUID(), type: 'text', + textRole: 'caption', trackId, from, durationInFrames, mediaId, - captionSource: buildCaptionSource(mediaId, clip.id), + captionSource: buildCaptionSource(mediaId, clip.id, sourceType), label: segment.text.slice(0, 48), text: segment.text, fontSize: Math.max(36, Math.round(canvasHeight * 0.045)), diff --git a/src/features/preview/components/edit-2up-panels.tsx b/src/features/preview/components/edit-2up-panels.tsx index 50900daad..804ce6d4c 100644 --- a/src/features/preview/components/edit-2up-panels.tsx +++ b/src/features/preview/components/edit-2up-panels.tsx @@ -4,8 +4,24 @@ import { type VideoFrameSource, } from '@/features/preview/deps/export'; import { getGlobalVideoSourcePool } from '@/features/preview/deps/player-pool'; +import { + backgroundBatchPreseek, + backgroundPreseek, + getCachedPredecodedBitmap, + waitForInflightPredecodedBitmap, +} from '@/features/preview/utils/decoder-prewarm'; +import { + getCachedEditOverlayFrame, + getEditOverlayFrameCacheKey, + hasCachedEditOverlayFrame, + putCachedEditOverlayFrame, +} from '@/features/preview/utils/edit-overlay-frame-cache'; +import { collectEditOverlayDirectionalPrewarmTimes } from 
'@/features/preview/utils/edit-overlay-prewarm-plan'; +import { + getActivePreviewScrubbingCache, + getActivePreviewVideoFrameEntry, +} from '@/features/preview/utils/preview-scrubbing-cache-bridge'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; import { resolveMediaUrl, resolveProxyUrl } from '../utils/media-resolver'; import { computeFittedMediaSize, @@ -13,6 +29,7 @@ import { renderPanelMedia, } from './edit-panel-media-utils'; import { useBlobUrlVersion } from '@/infrastructure/browser/blob-url-manager'; +import { useEditOverlayPanelPrewarm } from './use-edit-overlay-panel-prewarm'; const TYPE_PLACEHOLDER_COLORS: Record = { image: '#22c55e', @@ -29,10 +46,11 @@ const GAP = 8; const FALLBACK_CANVAS_WIDTH = 280; const FALLBACK_CANVAS_HEIGHT = 158; const STRICT_DECODE_FALLBACK_FAILURES = 2; -/** Frame cache for edit overlay panels — instant revisits during drag reversal */ -const EDIT_PANEL_CACHE_MAX = 60; -/** Quantize source time to ~frame-level resolution for cache keys */ const CACHE_TIME_QUANTUM = 1 / 60; +const STRICT_DECODE_SHARED_CACHE_WAIT_MS = 6; +const EDIT_OVERLAY_PREWARM_MAX_TIMESTAMPS = 6; +const SCRUBBING_CACHE_TOLERANCE_FACTOR = 0.9; +const EDIT_OVERLAY_LEGACY_SEEK_SPEED_EPSILON = 0.01; let previewVideoInstanceCounter = 0; let strictDecodeInstanceCounter = 0; let globalEditOverlayDecoderPool: SharedVideoExtractorPool | null = null; @@ -44,14 +62,18 @@ function getEditOverlayDecoderPool(): SharedVideoExtractorPool { return globalEditOverlayDecoderPool; } -function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean): string | null { +function quantizeOverlayCacheTime(sourceTime: number): number { + return Math.round(sourceTime / CACHE_TIME_QUANTUM) * CACHE_TIME_QUANTUM; +} + +function useResolvedVideoBlobUrl(mediaId: string | undefined): string | null { const [blobUrl, setBlobUrl] = useState<string | null>(null); const blobUrlVersion = useBlobUrlVersion(); const requestKeyRef = useRef<string | null>(null); useEffect(() => { let cancelled = false; - const requestKey = `${mediaId ??
'none'}:proxy-first`; if (requestKeyRef.current !== requestKey) { requestKeyRef.current = requestKey; setBlobUrl(null); @@ -63,14 +85,12 @@ function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean) }; } - if (useProxy) { - const proxyUrl = resolveProxyUrl(mediaId); - if (proxyUrl) { - setBlobUrl(proxyUrl); - return () => { - cancelled = true; - }; - } + const proxyUrl = resolveProxyUrl(mediaId); + if (proxyUrl) { + setBlobUrl(proxyUrl); + return () => { + cancelled = true; + }; } resolveMediaUrl(mediaId) @@ -86,7 +106,7 @@ function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean) return () => { cancelled = true; }; - }, [mediaId, useProxy, blobUrlVersion]); + }, [mediaId, blobUrlVersion]); return blobUrl; } @@ -107,6 +127,7 @@ interface EditTwoUpPanelsProps { export function EditTwoUpPanels({ leftPanel, rightPanel }: EditTwoUpPanelsProps) { const containerRef = useRef(null); const [containerSize, setContainerSize] = useState({ width: 0, height: 0 }); + useEditOverlayPanelPrewarm([leftPanel, rightPanel]); useEffect(() => { const el = containerRef.current; @@ -217,13 +238,29 @@ interface VideoFrameProps { function VideoFrameImpl({ item, sourceTime }: VideoFrameProps) { const [useLegacyFallback, setUseLegacyFallback] = useState(false); + const [legacyFailed, setLegacyFailed] = useState(false); + const prefersLegacySeek = Math.abs((item.speed ?? 1) - 1) < EDIT_OVERLAY_LEGACY_SEEK_SPEED_EPSILON; + const canUseLegacySeek = !legacyFailed; + + useEffect(() => { + setUseLegacyFallback(false); + setLegacyFailed(false); + }, [item.id, item.mediaId, prefersLegacySeek]); const handleStrictDecodeFailure = useCallback(() => { + if (!canUseLegacySeek) return; setUseLegacyFallback((prev) => (prev ? prev : true)); + }, [canUseLegacySeek]); + + const shouldUseLegacySeek = canUseLegacySeek && (prefersLegacySeek || useLegacyFallback); + + const handleLegacyFailure = useCallback(() => { + setLegacyFailed(true); + setUseLegacyFallback(false); }, []); - if (useLegacyFallback) { - return ; + if (shouldUseLegacySeek) { + return ; } return ( @@ -239,10 +276,6 @@ interface StrictDecodedVideoFrameProps extends VideoFrameProps { onDecodeFailure: () => void; } -function quantizeTime(t: number): number { - return Math.round(t / CACHE_TIME_QUANTUM) * CACHE_TIME_QUANTUM; -} - function StrictDecodedVideoFrame({ item, sourceTime, @@ -251,6 +284,7 @@ function StrictDecodedVideoFrame({ const canvasRef = useRef(null); const decoderPoolRef = useRef(getEditOverlayDecoderPool()); const decodeLaneRef = useRef(`edit-preview-strict-${++strictDecodeInstanceCounter}`); + const prewarmFps = Math.max(1, Math.round(item.sourceFps ?? 
60)); const extractorRef = useRef(null); const mountedRef = useRef(true); const decoderReadyRef = useRef(false); @@ -258,26 +292,97 @@ function StrictDecodedVideoFrame({ const pendingTimeRef = useRef(null); const consecutiveDecodeFailuresRef = useRef(0); const latestTargetTimeRef = useRef(Math.max(0, sourceTime)); - const useProxy = usePlaybackStore((s) => s.useProxy); - const blobUrl = useResolvedVideoBlobUrl(item.mediaId, useProxy); + const blobUrl = useResolvedVideoBlobUrl(item.mediaId); const contextRef = useRef(null); const decoderItemId = `${item.id}:${decodeLaneRef.current}`; - // Frame cache: quantized source time → ImageBitmap for instant revisits - const frameCacheRef = useRef>(new Map()); - const frameCacheOrderRef = useRef([]); + const prewarmInFlightRef = useRef(false); + const queuedPrewarmTimesRef = useRef([]); + const prewarmAnchorFrameRef = useRef(null); useEffect(() => { mountedRef.current = true; return () => { mountedRef.current = false; - // Clean up cached bitmaps on unmount - for (const bitmap of frameCacheRef.current.values()) { - bitmap.close(); - } - frameCacheRef.current.clear(); + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }; }, []); + const pumpDirectionalPrewarm = useCallback(() => { + if ( + prewarmInFlightRef.current + || !decoderReadyRef.current + || !mountedRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + if (!blobUrl) { + queuedPrewarmTimesRef.current = []; + return; + } + + const timestamps = queuedPrewarmTimesRef.current; + if (timestamps.length === 0) { + return; + } + + prewarmInFlightRef.current = true; + queuedPrewarmTimesRef.current = []; + + const run = async () => { + try { + await backgroundBatchPreseek(blobUrl, timestamps); + } finally { + prewarmInFlightRef.current = false; + if ( + mountedRef.current + && pendingTimeRef.current === null + && queuedPrewarmTimesRef.current.length > 0 + ) { + queueMicrotask(() => { + if (!mountedRef.current) return; + pumpDirectionalPrewarm(); + }); + } + } + }; + + void run(); + }, [blobUrl]); + + const queueDirectionalPrewarm = useCallback((targetTime: number) => { + const extractor = extractorRef.current; + if ( + !extractor + || !decoderReadyRef.current + || pendingTimeRef.current !== null + || !blobUrl + ) { + return; + } + + const result = collectEditOverlayDirectionalPrewarmTimes({ + targetTime, + duration: extractor.getDuration(), + fps: prewarmFps, + previousAnchorFrame: prewarmAnchorFrameRef.current, + quantumSeconds: CACHE_TIME_QUANTUM, + maxTimestamps: EDIT_OVERLAY_PREWARM_MAX_TIMESTAMPS, + isCached: (time) => { + const overlayCacheKey = getEditOverlayFrameCacheKey(blobUrl, time, CACHE_TIME_QUANTUM); + return hasCachedEditOverlayFrame(overlayCacheKey) + || getCachedPredecodedBitmap(blobUrl, time, CACHE_TIME_QUANTUM) !== null; + }, + }); + + prewarmAnchorFrameRef.current = result.targetFrame; + queuedPrewarmTimesRef.current = result.times; + pumpDirectionalPrewarm(); + }, [blobUrl, prewarmFps, pumpDirectionalPrewarm]); + const drawFrame = useCallback(async (targetTime: number) => { const extractor = extractorRef.current; const canvas = canvasRef.current; @@ -299,45 +404,102 @@ function StrictDecodedVideoFrame({ canvas.height = targetHeight; } - // Check frame cache first - const cacheKey = quantizeTime(targetTime); - const cache = frameCacheRef.current; - const cacheOrder = frameCacheOrderRef.current; - const cached = cache.get(cacheKey); - if (cached) { - ctx.drawImage(cached, 0, 0, canvas.width, 
canvas.height); - // Move to end of LRU order - const idx = cacheOrder.indexOf(cacheKey); - if (idx !== -1) { - cacheOrder.splice(idx, 1); - cacheOrder.push(cacheKey); + const cacheKey = blobUrl + ? getEditOverlayFrameCacheKey(blobUrl, targetTime, CACHE_TIME_QUANTUM) + : null; + const quantizedTargetTime = quantizeOverlayCacheTime(targetTime); + const drawBitmap = (bitmap: CanvasImageSource) => { + ctx.clearRect(0, 0, canvas.width, canvas.height); + ctx.drawImage(bitmap, 0, 0, canvas.width, canvas.height); + }; + const populateSharedScrubCache = (source: ImageBitmap, resolvedSourceTime: number) => { + const scrubbingCache = getActivePreviewScrubbingCache(); + if (!scrubbingCache) { + return; } + void createImageBitmap(source) + .then((bitmap) => { + scrubbingCache.putVideoFrame(item.id, bitmap, quantizeOverlayCacheTime(resolvedSourceTime)); + }) + .catch(() => { + // Shared scrub cache population is best-effort only. + }); + }; + + if (cacheKey) { + const sharedCachedFrame = getCachedEditOverlayFrame(cacheKey); + if (sharedCachedFrame) { + drawBitmap(sharedCachedFrame); + populateSharedScrubCache(sharedCachedFrame, targetTime); + return true; + } + } + + const scrubbingCacheTolerance = Math.max( + CACHE_TIME_QUANTUM / 2, + (SCRUBBING_CACHE_TOLERANCE_FACTOR / prewarmFps) / 2, + ); + const scrubCachedFrame = getActivePreviewVideoFrameEntry( + item.id, + quantizedTargetTime, + scrubbingCacheTolerance, + ); + if (scrubCachedFrame) { + drawBitmap(scrubCachedFrame.frame); return true; } - const didDraw = await extractor.drawFrame(ctx, Math.max(0, targetTime), 0, 0, canvas.width, canvas.height); - if (!didDraw) return false; - - // Cache the decoded frame as ImageBitmap - try { - const bitmap = await createImageBitmap(canvas); - cache.set(cacheKey, bitmap); - cacheOrder.push(cacheKey); - // LRU eviction - while (cacheOrder.length > EDIT_PANEL_CACHE_MAX) { - const evictKey = cacheOrder.shift()!; - const evicted = cache.get(evictKey); - if (evicted) { - evicted.close(); - cache.delete(evictKey); - } + if (blobUrl) { + const predecodedBitmap = getCachedPredecodedBitmap(blobUrl, targetTime, CACHE_TIME_QUANTUM); + if (predecodedBitmap) { + drawBitmap(predecodedBitmap); + populateSharedScrubCache(predecodedBitmap, targetTime); + return true; + } + + const inflightBitmap = await waitForInflightPredecodedBitmap( + blobUrl, + targetTime, + CACHE_TIME_QUANTUM, + STRICT_DECODE_SHARED_CACHE_WAIT_MS, + ).catch(() => null); + if (inflightBitmap) { + drawBitmap(inflightBitmap); + populateSharedScrubCache(inflightBitmap, targetTime); + return true; + } + } + + const drawResult = await extractor.drawFrameWithCapture( + ctx, + Math.max(0, targetTime), + 0, + 0, + canvas.width, + canvas.height, + ); + if (!drawResult.success) return false; + + const scrubbingCache = getActivePreviewScrubbingCache(); + if (scrubbingCache && drawResult.capturedFrame) { + scrubbingCache.putVideoFrame( + item.id, + drawResult.capturedFrame, + quantizedTargetTime, + ); + } + + if (cacheKey) { + try { + const bitmap = await createImageBitmap(canvas); + putCachedEditOverlayFrame(cacheKey, bitmap); + } catch { + // Shared overlay cache population is best-effort only. 
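+        // createImageBitmap can fail on an empty canvas; the frame is already drawn, so skip caching.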
} - } catch { - // createImageBitmap can fail on empty canvas — not critical } return true; - }, []); + }, [blobUrl]); const pumpLatestFrame = useCallback(() => { if (renderInFlightRef.current) return; @@ -352,6 +514,7 @@ function StrictDecodedVideoFrame({ const didDraw = await drawFrame(targetTime).catch(() => false); if (didDraw) { consecutiveDecodeFailuresRef.current = 0; + queueDirectionalPrewarm(targetTime); continue; } @@ -376,7 +539,7 @@ function StrictDecodedVideoFrame({ }; void run(); - }, [drawFrame, onDecodeFailure]); + }, [drawFrame, onDecodeFailure, queueDirectionalPrewarm]); useEffect(() => { decoderReadyRef.current = false; @@ -384,12 +547,9 @@ function StrictDecodedVideoFrame({ pendingTimeRef.current = null; consecutiveDecodeFailuresRef.current = 0; contextRef.current = null; - // Clear frame cache on source change - for (const bitmap of frameCacheRef.current.values()) { - bitmap.close(); - } - frameCacheRef.current.clear(); - frameCacheOrderRef.current.length = 0; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; if (!blobUrl) return; @@ -425,12 +585,18 @@ function StrictDecodedVideoFrame({ }, [blobUrl, decoderItemId, onDecodeFailure, pumpLatestFrame]); useEffect(() => { - latestTargetTimeRef.current = Math.max(0, sourceTime); - pendingTimeRef.current = latestTargetTimeRef.current; + const targetTime = Math.max(0, sourceTime); + latestTargetTimeRef.current = targetTime; + pendingTimeRef.current = targetTime; + + if (blobUrl) { + void backgroundPreseek(blobUrl, targetTime).catch(() => null); + } + if (decoderReadyRef.current) { pumpLatestFrame(); } - }, [sourceTime, pumpLatestFrame]); + }, [blobUrl, sourceTime, pumpLatestFrame]); return ( void; +} + +function LegacySeekVideoFrame({ item, sourceTime, onFailure }: LegacySeekVideoFrameProps) { const canvasRef = useRef(null); const poolRef = useRef(getGlobalVideoSourcePool()); const poolClipIdRef = useRef(`edit-preview-${++previewVideoInstanceCounter}`); const videoRef = useRef(null); - const useProxy = usePlaybackStore((s) => s.useProxy); - const blobUrl = useResolvedVideoBlobUrl(item.mediaId, useProxy); + const blobUrl = useResolvedVideoBlobUrl(item.mediaId); const contextRef = useRef(null); const seekingRef = useRef(false); const pendingTimeRef = useRef(null); @@ -510,6 +679,10 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { video.playsInline = true; videoRef.current = video; + const handleError = () => { + onFailure(); + }; + const handleSeeked = () => { seekingRef.current = false; drawFrame(); @@ -529,10 +702,12 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { } }; + video.addEventListener('error', handleError); video.addEventListener('seeked', handleSeeked); video.addEventListener('loadeddata', handleLoadedData); return () => { + video.removeEventListener('error', handleError); video.removeEventListener('seeked', handleSeeked); video.removeEventListener('loadeddata', handleLoadedData); video.pause(); @@ -541,7 +716,7 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { pendingTimeRef.current = null; pool.releaseClip(clipId); }; - }, [blobUrl, drawFrame, requestSeek]); + }, [blobUrl, drawFrame, onFailure, requestSeek]); useEffect(() => { const video = videoRef.current; diff --git a/src/features/preview/components/edit-4up-panels.tsx b/src/features/preview/components/edit-4up-panels.tsx index a6dd54370..982c2e78a 100644 --- a/src/features/preview/components/edit-4up-panels.tsx +++ 
b/src/features/preview/components/edit-4up-panels.tsx @@ -5,6 +5,7 @@ import { ImageFrame, TypePlaceholder, } from './edit-2up-panels'; +import { useEditOverlayPanelPrewarm } from './use-edit-overlay-panel-prewarm'; import type { TimelineItem } from '@/types/timeline'; import { getItemAspectRatio, @@ -31,6 +32,12 @@ export function EditFourUpPanels({ }: EditFourUpPanelsProps) { const containerRef = useRef(null); const [containerSize, setContainerSize] = useState({ width: 0, height: 0 }); + useEditOverlayPanelPrewarm([ + leftPanel, + rightPanel, + topLeftCorner ?? { item: null }, + topRightCorner ?? { item: null }, + ]); useEffect(() => { const el = containerRef.current; diff --git a/src/features/preview/components/inline-source-preview.tsx b/src/features/preview/components/inline-source-preview.tsx index 8a182775b..34819550e 100644 --- a/src/features/preview/components/inline-source-preview.tsx +++ b/src/features/preview/components/inline-source-preview.tsx @@ -162,6 +162,8 @@ const InlineSourcePreviewContent = memo(function InlineSourcePreviewContent({ src={blobUrl} mediaType={mediaType} fileName={media.fileName} + pausedFrameSource="clock" + forceFastScrub={seekFrame !== null} /> diff --git a/src/features/preview/components/rolling-edit-overlay-utils.ts b/src/features/preview/components/rolling-edit-overlay-utils.ts new file mode 100644 index 000000000..460fb043c --- /dev/null +++ b/src/features/preview/components/rolling-edit-overlay-utils.ts @@ -0,0 +1,28 @@ +import type { TimelineItem } from '@/types/timeline'; +import { getSourceFrameInfo } from './edit-overlay-utils'; + +interface RollingEditPanelFramesParams { + trimmedItem: TimelineItem; + neighborItem: TimelineItem; + handle: 'start' | 'end'; + neighborDelta: number; + fps: number; +} + +export function getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle, + neighborDelta, + fps, +}: RollingEditPanelFramesParams) { + const leftItem = handle === 'end' ? trimmedItem : neighborItem; + const rightItem = handle === 'end' ? 
neighborItem : trimmedItem; + + return { + leftItem, + rightItem, + outInfo: getSourceFrameInfo(leftItem, Math.max(0, leftItem.durationInFrames + neighborDelta - 1), fps), + inInfo: getSourceFrameInfo(rightItem, neighborDelta, fps), + }; +} diff --git a/src/features/preview/components/rolling-edit-overlay.test.ts b/src/features/preview/components/rolling-edit-overlay.test.ts new file mode 100644 index 000000000..d68ab9d67 --- /dev/null +++ b/src/features/preview/components/rolling-edit-overlay.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; +import type { TimelineItem } from '@/types/timeline'; +import { getRollingEditPanelFrames } from './rolling-edit-overlay-utils'; + +function makeVideoItem(overrides: Partial<TimelineItem> = {}): TimelineItem { + return { + id: 'item', + type: 'video', + trackId: 'track-1', + from: 0, + durationInFrames: 100, + label: 'Clip', + src: 'clip.mp4', + sourceStart: 0, + sourceEnd: 200, + sourceDuration: 200, + sourceFps: 30, + ...overrides, + } as TimelineItem; +} + +describe('getRollingEditPanelFrames', () => { + it('shows earlier incoming source frames when the edit point rolls left', () => { + const trimmedItem = makeVideoItem({ + id: 'left', + from: 0, + durationInFrames: 100, + sourceStart: 0, + }); + const neighborItem = makeVideoItem({ + id: 'right', + from: 100, + durationInFrames: 100, + sourceStart: 50, + }); + + const result = getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle: 'end', + neighborDelta: -30, + fps: 30, + }); + + expect(result.leftItem.id).toBe('left'); + expect(result.rightItem.id).toBe('right'); + expect(result.outInfo.sourceFrame).toBe(69); + expect(result.inInfo.sourceFrame).toBe(20); + }); + + it('shows later incoming source frames when the edit point rolls right from a start handle drag', () => { + const neighborItem = makeVideoItem({ + id: 'left', + from: 0, + durationInFrames: 100, + sourceStart: 0, + }); + const trimmedItem = makeVideoItem({ + id: 'right', + from: 100, + durationInFrames: 100, + sourceStart: 40, + }); + + const result = getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle: 'start', + neighborDelta: 12, + fps: 30, + }); + + expect(result.leftItem.id).toBe('left'); + expect(result.rightItem.id).toBe('right'); + expect(result.outInfo.sourceFrame).toBe(111); + expect(result.inInfo.sourceFrame).toBe(52); + }); +}); diff --git a/src/features/preview/components/rolling-edit-overlay.tsx b/src/features/preview/components/rolling-edit-overlay.tsx index b08df8f55..e3fd18958 100644 --- a/src/features/preview/components/rolling-edit-overlay.tsx +++ b/src/features/preview/components/rolling-edit-overlay.tsx @@ -4,7 +4,7 @@ import { } from '@/features/preview/deps/timeline-store'; import { useRollingEditPreviewStore } from '@/features/preview/deps/timeline-edit-preview'; import { EditTwoUpPanels } from './edit-2up-panels'; -import { getSourceFrameInfo } from './edit-overlay-utils'; +import { getRollingEditPanelFrames } from './rolling-edit-overlay-utils'; interface RollingEditOverlayProps { fps: number; @@ -27,19 +27,12 @@ export function RollingEditOverlay({ fps }: RollingEditOverlayProps) { const neighborItem = itemsMap.get(neighborItemId); if (!trimmedItem || !neighborItem) return null; - const leftItem = handle === 'end' ? trimmedItem : neighborItem; - const rightItem = handle === 'end' ?
leftItem.from + leftItem.durationInFrames + neighborDelta - : rightItem.from + neighborDelta; - - const outLocalFrame = Math.max(0, editPointFrame - leftItem.from - 1); - const inLocalFrame = Math.max(0, editPointFrame - rightItem.from); - - const outInfo = getSourceFrameInfo(leftItem, outLocalFrame, fps); - const inInfo = getSourceFrameInfo(rightItem, inLocalFrame, fps); + const { + leftItem, + rightItem, + outInfo, + inInfo, + } = getRollingEditPanelFrames({ trimmedItem, neighborItem, handle, neighborDelta, fps }); return ( ); } - diff --git a/src/features/preview/components/source-composition.tsx b/src/features/preview/components/source-composition.tsx index 624054419..f37c55030 100644 --- a/src/features/preview/components/source-composition.tsx +++ b/src/features/preview/components/source-composition.tsx @@ -16,7 +16,14 @@ import { type VideoFrameSource, } from '@/features/preview/deps/export'; import { resolveProxyUrl } from '../utils/media-resolver'; +import { + backgroundBatchPreseek, + getCachedPredecodedBitmap, + waitForInflightPredecodedBitmap, +} from '../utils/decoder-prewarm'; +import { getDirectionalPrewarmOffsets } from '../utils/fast-scrub-prewarm'; import { usePlaybackStore } from '@/shared/state/playback'; +import { useSourcePlayerStore } from '@/shared/state/source-player'; import { useMediaLibraryStore } from '@/features/preview/deps/media-library'; import { FileAudio } from 'lucide-react'; @@ -25,6 +32,8 @@ interface SourceCompositionProps { src: string; mediaType: 'video' | 'audio' | 'image'; fileName: string; + pausedFrameSource?: 'clock' | 'source-player'; + forceFastScrub?: boolean; } let sourceMonitorVideoInstanceCounter = 0; @@ -35,6 +44,12 @@ const SOURCE_MONITOR_STRICT_DECODE_FALLBACK_FAILURES = 2; const SOURCE_MONITOR_FRAME_CACHE_MAX = 90; const SOURCE_MONITOR_CACHE_TIME_QUANTUM = 1 / 60; const SOURCE_MONITOR_PLAYING_RESYNC_THRESHOLD_FRAMES = 6; +const SOURCE_MONITOR_PREWARM_MAX_TIMESTAMPS = 6; +const SOURCE_MONITOR_PREWARM_FORWARD_STEPS = 4; +const SOURCE_MONITOR_PREWARM_BACKWARD_STEPS = 6; +const SOURCE_MONITOR_PREWARM_OPPOSITE_STEPS = 2; +const SOURCE_MONITOR_PREWARM_NEUTRAL_RADIUS = 2; +const SOURCE_MONITOR_SHARED_CACHE_WAIT_MS = 4; function getSourceMonitorDecoderPool(): SharedVideoExtractorPool { if (!globalSourceMonitorDecoderPool) { @@ -66,9 +81,23 @@ function useSourceMonitorVideoSrc(mediaId: string | undefined, src: string): str }, [mediaId, proxyStatus, src, useProxy]); } -export function SourceComposition({ mediaId, src, mediaType, fileName }: SourceCompositionProps) { +export function SourceComposition({ + mediaId, + src, + mediaType, + fileName, + pausedFrameSource = 'source-player', + forceFastScrub = false, +}: SourceCompositionProps) { if (mediaType === 'video') { - return ; + return ( + + ); } if (mediaType === 'image') { return ; @@ -76,11 +105,26 @@ export function SourceComposition({ mediaId, src, mediaType, fileName }: SourceC return ; } -function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { +function VideoSource({ + mediaId, + src, + pausedFrameSource, + forceFastScrub, +}: { + mediaId?: string; + src: string; + pausedFrameSource: 'clock' | 'source-player'; + forceFastScrub: boolean; +}) { const activeSrc = useSourceMonitorVideoSrc(mediaId, src); const clock = useClock(); const playing = useClockIsPlaying(); const playbackRate = useClockPlaybackRate(); + const followSourcePlayerFrames = pausedFrameSource === 'source-player'; + const sourcePlayerPreviewScrubbing = useSourcePlayerStore((s) => ( + 
followSourcePlayerFrames && s.previewSourceFrame !== null + )); + const isPreviewScrubbing = forceFastScrub || sourcePlayerPreviewScrubbing; const videoContainerRef = useRef(null); const videoRef = useRef(null); const audioRef = useRef(null); @@ -99,18 +143,52 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const consecutiveDecodeFailuresRef = useRef(0); const frameCacheRef = useRef>(new Map()); const frameCacheOrderRef = useRef([]); + const prewarmInFlightRef = useRef(false); + const queuedPrewarmTimesRef = useRef([]); + const prewarmAnchorFrameRef = useRef(null); const { fps } = useVideoConfig(); const lastFrameRef = useRef(clock.currentFrame); const playingRef = useRef(playing); + const currentSourceFrameRef = useRef(useSourcePlayerStore.getState().currentSourceFrame); + const previewSourceFrameRef = useRef(useSourcePlayerStore.getState().previewSourceFrame); + const pausedRenderTargetKeyRef = useRef(null); const decoderItemId = `${mediaId ?? 'source-monitor'}:${decodeLaneRef.current}`; const [useLegacyPausedSeek, setUseLegacyPausedSeek] = useState(false); const [strictDecodeReady, setStrictDecodeReady] = useState(false); const [hasDecodedFrame, setHasDecodedFrame] = useState(false); + const [decodedFrameKey, setDecodedFrameKey] = useState(null); + const [pausedRenderTargetKey, setPausedRenderTargetKey] = useState(null); useEffect(() => { playingRef.current = playing; }, [playing]); + useEffect(() => { + if (!followSourcePlayerFrames) { + currentSourceFrameRef.current = clock.currentFrame; + previewSourceFrameRef.current = null; + return; + } + + return useSourcePlayerStore.subscribe((state) => { + currentSourceFrameRef.current = state.currentSourceFrame; + previewSourceFrameRef.current = state.previewSourceFrame; + }); + }, [clock.currentFrame, followSourcePlayerFrames]); + + const getResolvedPausedSourceFrame = useCallback(() => { + if (!followSourcePlayerFrames) { + return clock.currentFrame; + } + + const previewFrame = previewSourceFrameRef.current; + if (previewFrame !== null) { + return previewFrame; + } + + return currentSourceFrameRef.current; + }, [clock.currentFrame, followSourcePlayerFrames]); + useEffect(() => { mountedRef.current = true; return () => { @@ -120,14 +198,123 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } frameCacheRef.current.clear(); frameCacheOrderRef.current = []; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }; }, []); useEffect(() => { setUseLegacyPausedSeek(false); setHasDecodedFrame(false); + setDecodedFrameKey(null); + setPausedRenderTargetKey(null); + pausedRenderTargetKeyRef.current = null; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }, [activeSrc, mediaId]); + const pumpDirectionalPrewarm = useCallback(() => { + if ( + prewarmInFlightRef.current + || !decoderReadyRef.current + || !mountedRef.current + || playingRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + if (!activeSrc) { + queuedPrewarmTimesRef.current = []; + return; + } + + const timestamps = queuedPrewarmTimesRef.current; + if (timestamps.length === 0) { + return; + } + + prewarmInFlightRef.current = true; + queuedPrewarmTimesRef.current = []; + + const run = async () => { + try { + await backgroundBatchPreseek(activeSrc, timestamps); + } finally { + prewarmInFlightRef.current = false; + if ( + mountedRef.current + && !playingRef.current + && 
pendingTimeRef.current === null + && queuedPrewarmTimesRef.current.length > 0 + ) { + queueMicrotask(() => { + if (!mountedRef.current) return; + pumpDirectionalPrewarm(); + }); + } + } + }; + + void run(); + }, [activeSrc]); + + const queueDirectionalPrewarm = useCallback((targetTime: number) => { + const extractor = extractorRef.current; + if ( + !extractor + || !decoderReadyRef.current + || playingRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + const duration = extractor.getDuration(); + if (!Number.isFinite(duration) || duration <= 0) { + return; + } + + const targetFrame = Math.max(0, Math.round(targetTime * fps)); + const previousAnchorFrame = prewarmAnchorFrameRef.current; + const direction: -1 | 0 | 1 = previousAnchorFrame === null || previousAnchorFrame === targetFrame + ? 0 + : targetFrame > previousAnchorFrame + ? 1 + : -1; + prewarmAnchorFrameRef.current = targetFrame; + + const offsets = getDirectionalPrewarmOffsets(direction, { + forwardSteps: SOURCE_MONITOR_PREWARM_FORWARD_STEPS, + backwardSteps: SOURCE_MONITOR_PREWARM_BACKWARD_STEPS, + oppositeSteps: SOURCE_MONITOR_PREWARM_OPPOSITE_STEPS, + neutralRadius: SOURCE_MONITOR_PREWARM_NEUTRAL_RADIUS, + }); + + const maxFrame = Math.max(0, Math.floor(duration * fps) - 1); + const cache = frameCacheRef.current; + const nextPrewarmTimes: number[] = []; + const seen = new Set(); + + for (const offset of offsets) { + const prewarmFrame = targetFrame + offset; + if (prewarmFrame < 0 || prewarmFrame > maxFrame) continue; + const prewarmTime = quantizeSourceMonitorTime(prewarmFrame / fps); + if (prewarmTime === quantizeSourceMonitorTime(targetTime)) continue; + if (cache.has(prewarmTime) || seen.has(prewarmTime)) continue; + seen.add(prewarmTime); + nextPrewarmTimes.push(prewarmTime); + if (nextPrewarmTimes.length >= SOURCE_MONITOR_PREWARM_MAX_TIMESTAMPS) { + break; + } + } + + queuedPrewarmTimesRef.current = nextPrewarmTimes; + pumpDirectionalPrewarm(); + }, [fps, pumpDirectionalPrewarm]); + const drawDecodedFrame = useCallback(async (targetTime: number) => { const extractor = extractorRef.current; const canvas = canvasRef.current; @@ -150,6 +337,10 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } const cacheKey = quantizeSourceMonitorTime(targetTime); + const markDecodedFrame = () => { + setHasDecodedFrame(true); + setDecodedFrameKey((prev) => (prev === cacheKey ? 
prev : cacheKey)); + }; const cache = frameCacheRef.current; const cacheOrder = frameCacheOrderRef.current; const cached = cache.get(cacheKey); @@ -161,7 +352,33 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { cacheOrder.splice(cacheIndex, 1); cacheOrder.push(cacheKey); } + markDecodedFrame(); + return true; + } + + const drawSharedBitmap = (bitmap: ImageBitmap): boolean => { + ctx.clearRect(0, 0, canvas.width, canvas.height); + ctx.drawImage(bitmap, 0, 0, canvas.width, canvas.height); return true; + }; + + if (activeSrc) { + const sharedBitmap = getCachedPredecodedBitmap(activeSrc, Math.max(0, targetTime), SOURCE_MONITOR_CACHE_TIME_QUANTUM); + if (sharedBitmap && drawSharedBitmap(sharedBitmap)) { + markDecodedFrame(); + return true; + } + + const inflightBitmap = await waitForInflightPredecodedBitmap( + activeSrc, + Math.max(0, targetTime), + SOURCE_MONITOR_CACHE_TIME_QUANTUM, + SOURCE_MONITOR_SHARED_CACHE_WAIT_MS, + ).catch(() => null); + if (inflightBitmap && drawSharedBitmap(inflightBitmap)) { + markDecodedFrame(); + return true; + } } const didDraw = await extractor.drawFrame( @@ -190,8 +407,9 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { // Cache population is best-effort only. } + markDecodedFrame(); return true; - }, []); + }, [activeSrc]); const pumpLatestDecodedFrame = useCallback(() => { if (renderInFlightRef.current) return; @@ -211,7 +429,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const didDraw = await drawDecodedFrame(targetTime).catch(() => false); if (didDraw) { consecutiveDecodeFailuresRef.current = 0; - setHasDecodedFrame(true); + queueDirectionalPrewarm(targetTime); continue; } @@ -243,7 +461,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { }; void run(); - }, [drawDecodedFrame]); + }, [drawDecodedFrame, queueDirectionalPrewarm]); // Acquire/release pooled element when source changes. 
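// The source-change reset below also clears the directional prewarm queue, anchor, and in-flight flag.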
useEffect(() => { @@ -292,6 +510,9 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { pendingTimeRef.current = null; consecutiveDecodeFailuresRef.current = 0; contextRef.current = null; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; for (const bitmap of frameCacheRef.current.values()) { bitmap.close(); @@ -339,15 +560,22 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const video = videoRef.current; const audio = audioRef.current; const targetTime = frame / fps; + const targetCacheKey = quantizeSourceMonitorTime(targetTime); latestTargetTimeRef.current = targetTime; lastFrameRef.current = frame; - if (!playingRef.current && !useLegacyPausedSeek) { + if (!playingRef.current && !useLegacyPausedSeek && !isPreviewScrubbing) { + if (pausedRenderTargetKeyRef.current !== targetCacheKey) { + pausedRenderTargetKeyRef.current = targetCacheKey; + setPausedRenderTargetKey(targetCacheKey); + } pendingTimeRef.current = targetTime; if (decoderReadyRef.current) { pumpLatestDecodedFrame(); } + } else if (isPreviewScrubbing) { + pendingTimeRef.current = null; } const syncAudioTime = () => { @@ -376,7 +604,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const canSeek = video.readyState >= 1; if (!canSeek) return; - if (!playingRef.current && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek) { + if (!playingRef.current && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek && !isPreviewScrubbing) { syncAudioTime(); return; } @@ -395,25 +623,67 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { return; } - try { - poolRef.current.seekClip(poolClipIdRef.current, frame / fps, { fast: true }); - } catch { - // Ignore seek errors while media is loading + if (isPreviewScrubbing) { + if (Math.abs(video.currentTime - targetTime) >= 0.016) { + try { + video.currentTime = targetTime; + } catch { + // Ignore seek errors while media is loading + } + } + } else { + try { + poolRef.current.seekClip(poolClipIdRef.current, frame / fps, { fast: true }); + } catch { + // Ignore seek errors while media is loading + } } syncAudioTime(); - }, [activeSrc, fps, hasDecodedFrame, pumpLatestDecodedFrame, src, strictDecodeReady, useLegacyPausedSeek]); + }, [ + activeSrc, + fps, + hasDecodedFrame, + isPreviewScrubbing, + pumpLatestDecodedFrame, + src, + strictDecodeReady, + useLegacyPausedSeek, + ]); useEffect(() => { - syncSourceFrame(clock.currentFrame); + syncSourceFrame(playing ? clock.currentFrame : getResolvedPausedSourceFrame()); return clock.onFrameChange((frame) => { + if (!playingRef.current) { + return; + } syncSourceFrame(frame); }); - }, [clock, syncSourceFrame]); + }, [clock, getResolvedPausedSourceFrame, playing, syncSourceFrame]); + + useEffect(() => { + syncSourceFrame(playing ? clock.currentFrame : getResolvedPausedSourceFrame()); + }, [clock, getResolvedPausedSourceFrame, playing, syncSourceFrame]); useEffect(() => { - syncSourceFrame(clock.currentFrame); - }, [clock, playing, syncSourceFrame]); + if (!followSourcePlayerFrames) { + return; + } + + return useSourcePlayerStore.subscribe((state, prevState) => { + if ( + playingRef.current + || ( + state.previewSourceFrame === prevState.previewSourceFrame + && state.currentSourceFrame === prevState.currentSourceFrame + ) + ) { + return; + } + + syncSourceFrame(state.previewSourceFrame ?? 
state.currentSourceFrame); + }); + }, [followSourcePlayerFrames, syncSourceFrame]); // Handle play/pause sync useEffect(() => { @@ -454,7 +724,15 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } }, [playbackRate, playing, src]); - const showDecodedCanvas = !playing && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek; + const showDecodedCanvas = ( + !playing + && !isPreviewScrubbing + && strictDecodeReady + && hasDecodedFrame + && !useLegacyPausedSeek + && decodedFrameKey !== null + && decodedFrameKey === pausedRenderTargetKey + ); return ( diff --git a/src/features/preview/components/source-monitor.test.tsx b/src/features/preview/components/source-monitor.test.tsx index 600821972..89fd5d587 100644 --- a/src/features/preview/components/source-monitor.test.tsx +++ b/src/features/preview/components/source-monitor.test.tsx @@ -1,6 +1,6 @@ import { StrictMode, type ReactNode } from 'react'; import { beforeAll, beforeEach, describe, expect, it, vi } from 'vitest'; -import { render, waitFor } from '@testing-library/react'; +import { fireEvent, render, waitFor } from '@testing-library/react'; const editorStoreState = vi.hoisted(() => ({ sourcePreviewMediaId: 'media-1' as string | null, @@ -11,6 +11,7 @@ const sourcePlayerStoreState = vi.hoisted(() => ({ playerMethods: null as unknown, currentMediaId: null as string | null, currentSourceFrame: 0, + previewSourceFrame: null as number | null, inPoint: null as number | null, outPoint: null as number | null, pendingSeekFrame: null as number | null, @@ -19,6 +20,7 @@ const sourcePlayerStoreState = vi.hoisted(() => ({ setCurrentMediaId: vi.fn(), releaseCurrentMediaId: vi.fn(), setCurrentSourceFrame: vi.fn(), + setPreviewSourceFrame: vi.fn(), setInPoint: vi.fn(), setOutPoint: vi.fn(), clearInOutPoints: vi.fn(), @@ -42,23 +44,31 @@ const itemsStoreState = vi.hoisted(() => ({ tracks: [], })); +const playerMethodsState = vi.hoisted(() => ({ + seek: vi.fn(), + play: vi.fn(), + pause: vi.fn(), + toggle: vi.fn(), + frameBack: vi.fn(), + frameForward: vi.fn(), +})); + +const clockState = vi.hoisted(() => ({ + currentFrame: 0, + isPlaying: false, +})); + vi.mock('@/features/preview/deps/player-context', () => ({ PlayerEmitterProvider: ({ children }: { children: ReactNode }) => <>{children}, ClockBridgeProvider: ({ children }: { children: ReactNode }) => <>{children}, VideoConfigProvider: ({ children }: { children: ReactNode }) => <>{children}, useClock: () => ({ - currentFrame: 0, - isPlaying: false, + currentFrame: clockState.currentFrame, + isPlaying: clockState.isPlaying, onFrameChange: () => () => {}, }), - useClockIsPlaying: () => false, - usePlayer: () => ({ - seek: vi.fn(), - play: vi.fn(), - toggle: vi.fn(), - frameBack: vi.fn(), - frameForward: vi.fn(), - }), + useClockIsPlaying: () => clockState.isPlaying, + usePlayer: () => playerMethodsState, })); vi.mock('./source-composition', () => ({ @@ -165,11 +175,19 @@ describe('SourceMonitor current media ownership', () => { } vi.stubGlobal('ResizeObserver', ResizeObserverMock); + vi.stubGlobal('requestAnimationFrame', (callback: FrameRequestCallback) => ( + window.setTimeout(() => callback(performance.now()), 0) + )); + vi.stubGlobal('cancelAnimationFrame', (handle: number) => { + window.clearTimeout(handle); + }); }); beforeEach(() => { vi.clearAllMocks(); editorStoreState.sourcePreviewMediaId = 'media-1'; + clockState.currentFrame = 0; + clockState.isPlaying = false; }); it('does not release the current media during the initial Strict Mode remount', async () => { 
@@ -199,4 +217,64 @@ describe('SourceMonitor current media ownership', () => { expect(sourcePlayerStoreState.releaseCurrentMediaId).toHaveBeenCalledWith('media-1'); }); + it('batches seek bar drags and commits the final frame on mouseup', async () => { + const rendered = render(); + + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentMediaId).toHaveBeenCalledWith('media-1'); + }); + + const seekBar = rendered.getByTestId('source-monitor-seek-bar'); + vi.spyOn(seekBar, 'getBoundingClientRect').mockReturnValue({ + x: 0, + y: 0, + top: 0, + left: 0, + right: 100, + bottom: 10, + width: 100, + height: 10, + toJSON: () => ({}), + }); + + fireEvent.mouseDown(seekBar, { clientX: 25 }); + fireEvent.mouseMove(document, { clientX: 75 }); + + expect(playerMethodsState.seek).not.toHaveBeenCalled(); + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentSourceFrame).toHaveBeenLastCalledWith(112); + }); + + fireEvent.mouseUp(document); + + expect(playerMethodsState.seek).toHaveBeenCalledTimes(1); + expect(playerMethodsState.seek).toHaveBeenCalledWith(112); + }); + + it('pauses playback when seek-bar scrubbing starts', async () => { + clockState.isPlaying = true; + const rendered = render(); + + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentMediaId).toHaveBeenCalledWith('media-1'); + }); + + const seekBar = rendered.getByTestId('source-monitor-seek-bar'); + vi.spyOn(seekBar, 'getBoundingClientRect').mockReturnValue({ + x: 0, + y: 0, + top: 0, + left: 0, + right: 100, + bottom: 10, + width: 100, + height: 10, + toJSON: () => ({}), + }); + + fireEvent.mouseDown(seekBar, { clientX: 25 }); + + expect(playerMethodsState.pause).toHaveBeenCalledTimes(1); + }); + }); diff --git a/src/features/preview/components/source-monitor.tsx b/src/features/preview/components/source-monitor.tsx index 7184e6e3f..8a772345c 100644 --- a/src/features/preview/components/source-monitor.tsx +++ b/src/features/preview/components/source-monitor.tsx @@ -41,6 +41,7 @@ import { useEditorStore } from '@/app/state/editor'; import { useSourcePlayerStore } from '@/shared/state/source-player'; import { useSelectionStore } from '@/shared/state/selection'; import { EDITOR_LAYOUT_CSS_VALUES, getEditorLayout } from '@/app/editor-layout'; +import { createScrubThrottleState, shouldCommitScrubFrame } from '../deps/timeline-utils'; import { cn } from '@/shared/ui/cn'; import { formatTimecodeCompact } from '@/shared/utils/time-utils'; import type { TimelineTrack } from '@/types/timeline'; @@ -559,6 +560,12 @@ function SourcePlaybackControls({ [fps, formatFrameNumber], ); + const clearPreviewSourceFrame = useCallback(() => { + if (interactive) { + useSourcePlayerStore.getState().setPreviewSourceFrame(null); + } + }, [interactive]); + const updateFrameDisplay = useCallback((frame: number) => { currentFrameRef.current = frame; if (interactive) { @@ -575,21 +582,44 @@ function SourcePlaybackControls({ } }, [fps, formatFrameNumber, interactive, lastFrame]); + const commitSourceSeek = useCallback((frame: number) => { + clearPreviewSourceFrame(); + updateFrameDisplay(frame); + player.seek(frame); + }, [clearPreviewSourceFrame, player, updateFrameDisplay]); + // Bridge player methods into the source player store for keyboard shortcuts useEffect(() => { if (!interactive) return; const setPlayerMethods = useSourcePlayerStore.getState().setPlayerMethods; setPlayerMethods({ - toggle: player.toggle, - seek: player.seek, - frameBack: player.frameBack, - frameForward: player.frameForward, + toggle: () => { + const 
previewFrame = useSourcePlayerStore.getState().previewSourceFrame; + if (previewFrame !== null) { + commitSourceSeek(previewFrame); + } + player.toggle(); + }, + pause: () => { + player.pause(); + }, + seek: (frame) => { + commitSourceSeek(frame); + }, + frameBack: (frames) => { + clearPreviewSourceFrame(); + player.frameBack(frames); + }, + frameForward: (frames) => { + clearPreviewSourceFrame(); + player.frameForward(frames); + }, getDurationInFrames: () => durationInFrames, }); return () => { useSourcePlayerStore.getState().setPlayerMethods(null); }; - }, [durationInFrames, interactive, player.toggle, player.seek, player.frameBack, player.frameForward]); + }, [clearPreviewSourceFrame, commitSourceSeek, durationInFrames, interactive, player]); useEffect(() => { updateFrameDisplay(clock.currentFrame); @@ -603,20 +633,29 @@ function SourcePlaybackControls({ }); }, [clock, player, updateFrameDisplay]); - // Consume pending seek (e.g. double-click opens clip at its In point) + // Consume pending seek. Always pause → seek → (optionally) play so + // switching scenes lands a clean transition: no `player.play()` short- + // circuiting because the previous scene was still playing (the ref + // `imperativePlaying.current` blocks a second play), and the video + // element isn't decoding the old frame while the seek is in flight. const pendingSeekFrame = useSourcePlayerStore((s) => s.pendingSeekFrame); useEffect(() => { if (!interactive) return; if (pendingSeekFrame !== null) { - player.seek(pendingSeekFrame); - useSourcePlayerStore.getState().setPendingSeekFrame(null); + player.pause(); + commitSourceSeek(pendingSeekFrame); + const store = useSourcePlayerStore.getState(); + store.setPendingSeekFrame(null); + const shouldPlay = store.pendingPlay; + store.setPendingPlay(false); + if (shouldPlay) player.play(); } - }, [interactive, pendingSeekFrame, player]); + }, [commitSourceSeek, interactive, pendingSeekFrame, player]); useEffect(() => { if (seekFrame === null) return; - player.seek(seekFrame); - }, [player, seekFrame]); + commitSourceSeek(seekFrame); + }, [commitSourceSeek, seekFrame]); // Read I/O points from store const inPoint = useSourcePlayerStore((s) => s.inPoint); @@ -630,29 +669,124 @@ function SourcePlaybackControls({ const draggingRef = useRef(false); const onMoveRef = useRef<((ev: MouseEvent) => void) | null>(null); const onUpRef = useRef<(() => void) | null>(null); - - const seekFromX = useCallback( + const pendingBarSeekFrameRef = useRef(null); + const pendingBarPointerXRef = useRef(null); + const barSeekRafRef = useRef(null); + const lastIssuedBarSeekFrameRef = useRef(null); + const scrubThrottleStateRef = useRef(createScrubThrottleState({ + frame: clock.currentFrame, + nowMs: performance.now(), + })); + + const frameFromBarX = useCallback( (clientX: number) => { const bar = barRef.current; - if (!bar) return; + if (!bar) return null; const rect = bar.getBoundingClientRect(); + if (rect.width <= 0) { + return 0; + } const pct = Math.max(0, Math.min(1, (clientX - rect.left) / rect.width)); - player.seek(Math.round(pct * lastFrame)); + return Math.round(pct * lastFrame); }, - [player, lastFrame], + [lastFrame], ); + const flushBarSeekFrame = useCallback((frame: number) => { + lastIssuedBarSeekFrameRef.current = frame; + commitSourceSeek(frame); + }, [commitSourceSeek]); + + const getBarPixelsPerSecond = useCallback(() => { + const bar = barRef.current; + if (!bar || durationInFrames <= 0 || fps <= 0) { + return 0; + } + + return (bar.clientWidth * fps) / durationInFrames; + }, 
[durationInFrames, fps]); + + const previewBarSeekFrame = useCallback((frame: number) => { + pendingBarSeekFrameRef.current = frame; + + if (interactive) { + useSourcePlayerStore.getState().setPreviewSourceFrame(frame); + } + if (currentFrameRef.current !== frame) { + updateFrameDisplay(frame); + } + }, [interactive, updateFrameDisplay]); + + const scheduleBarSeekFrame = useCallback((frame: number, pointerX: number, force = false) => { + pendingBarSeekFrameRef.current = frame; + pendingBarPointerXRef.current = pointerX; + + if (force) { + previewBarSeekFrame(frame); + return; + } + + if (barSeekRafRef.current !== null) { + return; + } + + barSeekRafRef.current = requestAnimationFrame(() => { + barSeekRafRef.current = null; + const pendingFrame = pendingBarSeekFrameRef.current; + const pendingPointerX = pendingBarPointerXRef.current; + if (pendingFrame === null || pendingPointerX === null) { + return; + } + + if (shouldCommitScrubFrame({ + state: scrubThrottleStateRef.current, + pointerX: pendingPointerX, + targetFrame: pendingFrame, + pixelsPerSecond: getBarPixelsPerSecond(), + nowMs: performance.now(), + })) { + previewBarSeekFrame(pendingFrame); + } + }); + }, [getBarPixelsPerSecond, previewBarSeekFrame]); + const handleBarMouseDown = useCallback( (e: React.MouseEvent) => { e.preventDefault(); e.stopPropagation(); draggingRef.current = true; - seekFromX(e.clientX); + if (playing) { + player.pause(); + replayingRef.current = false; + } + const initialFrame = frameFromBarX(e.clientX); + if (initialFrame !== null) { + scrubThrottleStateRef.current = createScrubThrottleState({ + pointerX: e.clientX, + frame: initialFrame, + nowMs: performance.now(), + }); + scheduleBarSeekFrame(initialFrame, e.clientX, true); + } const onMove = (ev: MouseEvent) => { - if (draggingRef.current) seekFromX(ev.clientX); + if (!draggingRef.current) return; + const nextFrame = frameFromBarX(ev.clientX); + if (nextFrame !== null) { + scheduleBarSeekFrame(nextFrame, ev.clientX); + } }; const onUp = () => { + const pendingFrame = pendingBarSeekFrameRef.current; draggingRef.current = false; + pendingBarSeekFrameRef.current = null; + pendingBarPointerXRef.current = null; + if (barSeekRafRef.current !== null) { + cancelAnimationFrame(barSeekRafRef.current); + barSeekRafRef.current = null; + } + if (pendingFrame !== null) { + flushBarSeekFrame(pendingFrame); + } if (onMoveRef.current) { document.removeEventListener('mousemove', onMoveRef.current); onMoveRef.current = null; @@ -667,12 +801,19 @@ function SourcePlaybackControls({ document.addEventListener('mousemove', onMove); document.addEventListener('mouseup', onUp); }, - [seekFromX], + [flushBarSeekFrame, frameFromBarX, player, playing, scheduleBarSeekFrame], ); // Clean up document listeners on unmount useEffect(() => { return () => { + pendingBarSeekFrameRef.current = null; + pendingBarPointerXRef.current = null; + lastIssuedBarSeekFrameRef.current = null; + if (barSeekRafRef.current !== null) { + cancelAnimationFrame(barSeekRafRef.current); + barSeekRafRef.current = null; + } if (onMoveRef.current) { document.removeEventListener('mousemove', onMoveRef.current); onMoveRef.current = null; @@ -803,9 +944,35 @@ function SourcePlaybackControls({ const { inPoint: ip, outPoint: op } = useSourcePlayerStore.getState(); if (ip === null && op === null) return; replayingRef.current = true; - player.seek(ip ?? 0); + commitSourceSeek(ip ?? 
0); player.play(); - }, [player]); + }, [commitSourceSeek, player]); + + const handleGoToStart = useCallback(() => { + commitSourceSeek(0); + }, [commitSourceSeek]); + + const handleStepBack = useCallback(() => { + clearPreviewSourceFrame(); + player.frameBack(1); + }, [clearPreviewSourceFrame, player]); + + const handleTogglePlayback = useCallback(() => { + const previewFrame = useSourcePlayerStore.getState().previewSourceFrame; + if (previewFrame !== null) { + commitSourceSeek(previewFrame); + } + player.toggle(); + }, [commitSourceSeek, player]); + + const handleStepForward = useCallback(() => { + clearPreviewSourceFrame(); + player.frameForward(1); + }, [clearPreviewSourceFrame, player]); + + const handleGoToEnd = useCallback(() => { + commitSourceSeek(lastFrame); + }, [commitSourceSeek, lastFrame]); const activeTrack = useMemo( () => (activeTrackId ? tracks.find((track) => track.id === activeTrackId) ?? null : null), @@ -1021,6 +1188,7 @@ function SourcePlaybackControls({ {/* Seek bar */}
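All five transport handlers above route through the same preview/commit split: scrub moves only update a preview frame, a commit is the single place a real `player.seek` is issued, and anything that resumes playback or steps frames clears the preview first. A standalone model of that flow, with an illustrative `Player` type and controller shape rather than the component's actual API:

```ts
type Player = { seek: (frame: number) => void };

// Minimal sketch of the preview/commit split, assuming a bare seek-only
// player; the component realizes the same flow via previewBarSeekFrame,
// commitSourceSeek, and clearPreviewSourceFrame.
function makeScrubController(player: Player) {
  let previewFrame: number | null = null;
  return {
    // mousemove: cheap display/store update only, no decoder work.
    preview(frame: number) { previewFrame = frame; },
    // play / frame-step: a stale preview must not outlive scrubbing.
    clear() { previewFrame = null; },
    // mouseup: the one place a real seek is issued.
    commit() {
      if (previewFrame !== null) player.seek(previewFrame);
      previewFrame = null;
    },
  };
}
```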
@@ -1098,7 +1266,7 @@ function SourcePlaybackControls({
- @@ -1106,7 +1274,7 @@ function SourcePlaybackControls({ - @@ -1114,7 +1282,7 @@ function SourcePlaybackControls({ - @@ -1122,7 +1290,7 @@ function SourcePlaybackControls({ - @@ -1130,7 +1298,7 @@ function SourcePlaybackControls({ - diff --git a/src/features/preview/components/timecode-display.test.tsx b/src/features/preview/components/timecode-display.test.tsx index bf3161621..c1fcaab4e 100644 --- a/src/features/preview/components/timecode-display.test.tsx +++ b/src/features/preview/components/timecode-display.test.tsx @@ -54,4 +54,30 @@ describe('TimecodeDisplay', () => { expect(button).toHaveTextContent('0012'); expect(button).toHaveTextContent('0999'); }); + + it('shows the skim preview frame in the timecode readout', () => { + render(); + + const button = screen.getByRole('button'); + expect(button).toHaveTextContent('00:00:12'); + + usePlaybackStore.getState().setPreviewFrame(48); + + expect(button).toHaveTextContent('00:01:18'); + }); + + it('prefers the displayed overlay frame when fast scrub owns presentation', () => { + render(); + + const button = screen.getByRole('button'); + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 1, + previewFrame: 48, + previewFrameEpoch: 2, + }); + usePreviewBridgeStore.getState().setDisplayedFrame(50); + + expect(button).toHaveTextContent('00:01:20'); + }); }); diff --git a/src/features/preview/components/timecode-display.tsx b/src/features/preview/components/timecode-display.tsx index cd387be52..8e0c66df8 100644 --- a/src/features/preview/components/timecode-display.tsx +++ b/src/features/preview/components/timecode-display.tsx @@ -1,5 +1,6 @@ import { useState, useEffect, useRef, useCallback } from 'react'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; +import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { formatTimecodeCompact } from '@/shared/utils/time-utils'; interface TimecodeDisplayProps { @@ -38,7 +39,20 @@ export function TimecodeDisplay({ fps, totalFrames }: TimecodeDisplayProps) { return frame.toString().padStart(maxDigits, '0'); }, []); - // Subscribe to currentFrame changes and update DOM directly (no React re-renders) + const getVisibleFrame = useCallback(() => { + const playbackState = usePlaybackStore.getState(); + return getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); + }, []); + + // Subscribe to the resolved visible preview frame and update DOM directly + // (no React re-renders during playback/scrub). 
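The two new tests above pin down how the timecode readout arbitrates its inputs; a reconstruction covering just those tested cases follows. The real `getResolvedPlaybackFrame` in `shared/state/playback` is not part of this diff and may weigh more inputs:

```ts
// Sketch consistent with the tests: a newer preview frame wins while
// paused, and the frame the preview bridge actually drew is preferred
// over the frame that was merely requested.
interface ResolvedFrameInput {
  currentFrame: number;
  currentFrameEpoch: number;
  previewFrame: number | null;
  previewFrameEpoch: number;
  isPlaying: boolean;
  displayedFrame: number | null;
}

function resolveVisibleFrameSketch(input: ResolvedFrameInput): number {
  if (
    input.previewFrame !== null
    && input.previewFrameEpoch > input.currentFrameEpoch
    && !input.isPlaying
  ) {
    return input.displayedFrame ?? input.previewFrame;
  }
  return input.currentFrame;
}
```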
useEffect(() => { const updateDisplay = (frame: number) => { if (!currentTimeRef.current) return; @@ -48,22 +62,29 @@ export function TimecodeDisplay({ fps, totalFrames }: TimecodeDisplayProps) { }; // Initial update - updateDisplay(usePlaybackStore.getState().currentFrame); + updateDisplay(getVisibleFrame()); - // Subscribe to store changes - return usePlaybackStore.subscribe((state) => { - updateDisplay(state.currentFrame); - }); - }, [formatFrameNumber]); + const syncDisplay = () => { + updateDisplay(getVisibleFrame()); + }; + + const unsubscribePlayback = usePlaybackStore.subscribe(syncDisplay); + const unsubscribePreviewBridge = usePreviewBridgeStore.subscribe(syncDisplay); + + return () => { + unsubscribePlayback(); + unsubscribePreviewBridge(); + }; + }, [formatFrameNumber, getVisibleFrame]); // Update display when showFrames or fps changes (rare - can trigger re-render) useEffect(() => { if (!currentTimeRef.current) return; - const frame = usePlaybackStore.getState().currentFrame; + const frame = getVisibleFrame(); currentTimeRef.current.textContent = showFrames ? formatFrameNumber(frame) : formatTimecodeCompact(frame, fps); - }, [showFrames, fps, formatFrameNumber]); + }, [showFrames, fps, formatFrameNumber, getVisibleFrame]); return ( + + +
+ More colors + {hidden.length} +
+
+ {hidden.map((cluster, i) => ( + + ))} +
+
+ + )} +
+ ); +}); + +function SwatchButton({ + cluster, + totalWeight, + active, + onPick, +}: { + cluster: Cluster; + totalWeight: number; + active: boolean; + onPick: (cluster: Cluster) => void; +}) { + const [r, g, b] = labToRgb(cluster.l, cluster.a, cluster.b); + const share = cluster.weight / totalWeight; + const label = `Find scenes in this color (${Math.round(share * 100)}% of the library)`; + return ( +
+
+ +
+
+ +
+ + + {isFiltered + ? `${scenes.length} ${scenes.length === 1 ? 'match' : 'matches'} · ${scopeLabel}` + : scopeLabel} + +
+ + Sort + +
+
+ + {/* The `[&>...]` override forces Radix's inner viewport wrapper to + block layout. Radix defaults it to `display: table; min-width: 100%` + which lets the wrapper grow past the viewport width if any row has + a long intrinsic min-width — that overflow slides row content + underneath the vertical scrollbar. Block layout keeps the wrapper + clamped to viewport width so the scrollbar sits in its own column. */} + +
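The class string itself is not shown in this hunk; the override the comment describes is an arbitrary variant targeting Radix's viewport wrapper. An illustrative sketch, assuming the shadcn/ui `ScrollArea` wrapper and Radix's `data-radix-scroll-area-viewport` attribute (the exact selector used in the panel may differ):

```tsx
import type { ReactNode } from 'react';
import { ScrollArea } from '@/components/ui/scroll-area';

// Illustrative only: forces the viewport's inner wrapper from Radix's
// default `display: table; min-width: 100%` to block layout so a long
// row cannot widen it past the viewport.
export function ResultsScroller({ children }: { children: ReactNode }) {
  return (
    <ScrollArea className="h-full [&>[data-radix-scroll-area-viewport]>div]:!block [&>[data-radix-scroll-area-viewport]>div]:!min-w-0">
      {children}
    </ScrollArea>
  );
}
```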
+ {reanalyzingMedia.length > 0 && ( + + )} + {(indexProgress.indexTotal > 0 || indexProgress.loadingModel) && ( + + )} + {hasResults ? ( + viewMode === 'grid' ? ( +
+ {scenes.map((scene, index) => ( + + ))} +
+ ) : ( + scenes.map((scene, index) => ( + + )) + ) + ) : reanalyzingMedia.length === 0 ? ( + + ) : null} +
+
+
+ ); +} + +function SemanticIndexBanner({ + progress, +}: { + progress: { indexing: number; indexTotal: number; loadingModel: boolean }; +}) { + const label = progress.loadingModel + ? 'Downloading semantic model (~22 MB, first run only)…' + : `Indexing captions for semantic search — ${progress.indexing}/${progress.indexTotal} clips`; + return ( +
+ + {label} +
+ ); +} + +function ReanalyzingBanner({ + items, +}: { + items: Array<{ id: string; fileName: string }>; +}) { + const label = items.length === 1 + ? items[0]!.fileName + : `${items.length} clips`; + return ( +
+ + + Re-analyzing {label} — scenes will refresh when done. + +
+ ); +} + +/** + * Compact replacement for the scope ` setQuery(e.target.value)} + placeholder={ + reference + ? 'Finding scenes with a similar palette…' + : semanticActive + ? 'Search by meaning — "sunset over water", "people laughing"…' + : 'Search scenes by what you see…' + } + disabled={!!reference} + className="h-8 pl-8 pr-7 text-[12px] disabled:opacity-60" + spellCheck={false} + autoComplete="off" + /> + {query.length > 0 && !reference && ( + + )} +
+ {reference && ( + + )} +
+ ); +} diff --git a/src/features/scene-browser/deps/analysis-contract.ts b/src/features/scene-browser/deps/analysis-contract.ts new file mode 100644 index 000000000..9a5d26923 --- /dev/null +++ b/src/features/scene-browser/deps/analysis-contract.ts @@ -0,0 +1,27 @@ +/** + * Cross-feature contract — scene-browser uses the embeddings provider for + * semantic search (query embedding + background indexer). + */ + +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, + extractDominantColorPhrase, + deltaE2000, + rgbToLab, +} from '@/infrastructure/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from '@/infrastructure/analysis'; diff --git a/src/features/scene-browser/deps/analysis.ts b/src/features/scene-browser/deps/analysis.ts new file mode 100644 index 000000000..6195064ad --- /dev/null +++ b/src/features/scene-browser/deps/analysis.ts @@ -0,0 +1 @@ +export * from './analysis-contract'; diff --git a/src/features/scene-browser/deps/media-library-contract.ts b/src/features/scene-browser/deps/media-library-contract.ts new file mode 100644 index 000000000..e62dc9d53 --- /dev/null +++ b/src/features/scene-browser/deps/media-library-contract.ts @@ -0,0 +1,10 @@ +/** + * Cross-feature adapter contract — scene-browser accesses media-library + * state and helpers through this file so the import graph is auditable. + */ + +export { useMediaLibraryStore } from '@/features/media-library/stores/media-library-store'; +export { getMediaType, formatDuration } from '@/features/media-library/utils/validation'; +export { mediaLibraryService } from '@/features/media-library/services/media-library-service'; +export { mediaAnalysisService } from '@/features/media-library/services/media-analysis-service'; +export type { MediaLibraryNotification } from '@/features/media-library/types'; diff --git a/src/features/scene-browser/deps/media-library.ts b/src/features/scene-browser/deps/media-library.ts new file mode 100644 index 000000000..9a640311e --- /dev/null +++ b/src/features/scene-browser/deps/media-library.ts @@ -0,0 +1,10 @@ +/** + * Cross-feature adapter — scene-browser accesses media-library state and + * the shared source player through this barrel so the import graph stays + * one-directional (feature-boundary rule in CLAUDE.md). + */ + +export * from './media-library-contract'; +export { useSourcePlayerStore } from '@/shared/state/source-player'; +export { useEditorStore } from '@/app/state/editor'; +export type { MediaMetadata } from '@/types/storage'; diff --git a/src/features/scene-browser/deps/settings-contract.ts b/src/features/scene-browser/deps/settings-contract.ts new file mode 100644 index 000000000..d212c2e36 --- /dev/null +++ b/src/features/scene-browser/deps/settings-contract.ts @@ -0,0 +1,7 @@ +/** + * Adapter — scene-browser reads `captionSearchMode` from the app settings + * store through this contract so the boundary checker stays happy. 
+ */ + +export { useSettingsStore } from '@/features/settings/stores/settings-store'; +export type { CaptionSearchMode } from '@/features/settings/stores/settings-store'; diff --git a/src/features/scene-browser/deps/settings.ts b/src/features/scene-browser/deps/settings.ts new file mode 100644 index 000000000..5322f23d4 --- /dev/null +++ b/src/features/scene-browser/deps/settings.ts @@ -0,0 +1 @@ +export * from './settings-contract'; diff --git a/src/features/scene-browser/deps/storage.ts b/src/features/scene-browser/deps/storage.ts new file mode 100644 index 000000000..f014cadf3 --- /dev/null +++ b/src/features/scene-browser/deps/storage.ts @@ -0,0 +1,17 @@ +/** + * Storage adapter — loads caption thumbnail blobs from workspace-fs. + */ + +export { + getCaptionThumbnailBlob, + saveCaptionThumbnail, + probeCaptionThumbnail, + saveCaptionEmbeddings, + getCaptionEmbeddings, + getCaptionsEmbeddingsMeta, + saveCaptionImageEmbeddings, + getCaptionImageEmbeddings, + getTranscript, + getScenes, +} from '@/infrastructure/storage'; +export type { SavedScenes } from '@/infrastructure/storage'; diff --git a/src/features/scene-browser/hooks/use-caption-thumbnail.ts b/src/features/scene-browser/hooks/use-caption-thumbnail.ts new file mode 100644 index 000000000..3ce749912 --- /dev/null +++ b/src/features/scene-browser/hooks/use-caption-thumbnail.ts @@ -0,0 +1,112 @@ +import { useEffect, useRef, useState } from 'react'; +import { getCaptionThumbnailBlob } from '../deps/storage'; +import { requestLazyCaptionThumbnail } from '../utils/lazy-thumb'; + +/** + * Module-scoped blob URL cache keyed by `thumbRelPath`. Scene Browser rows + * are virtualized / remount frequently, so loading the same JPEG for every + * mount would thrash the workspace-fs read path. Entries are evicted by + * {@link invalidateMediaCaptionThumbBlobs} when the source media is + * re-analyzed — without that, a blob URL keeps pointing at the pre-reanalyze + * JPEG content even after the on-disk file changes. + */ +const blobUrlCache = new Map<string, string>(); +const pendingLoads = new Map<string, Promise<string | null>>(); + +/** + * Revoke and drop every blob URL that lives under a media's + * captions-thumbs directory. Callers should invoke this before a + * re-analysis run so the next render loads the freshly-written JPEG + * instead of the cached pre-overwrite blob. + */ +export function invalidateMediaCaptionThumbBlobs(mediaId: string): void { + const prefix = `media/${mediaId}/cache/ai/captions-thumbs/`; + for (const [key, url] of blobUrlCache) { + if (key.startsWith(prefix)) { + URL.revokeObjectURL(url); + blobUrlCache.delete(key); + } + } + for (const key of pendingLoads.keys()) { + if (key.startsWith(prefix)) pendingLoads.delete(key); + } +} + +async function loadBlobUrl(relPath: string): Promise<string | null> { + const cached = blobUrlCache.get(relPath); + if (cached) return cached; + const pending = pendingLoads.get(relPath); + if (pending) return pending; + + const promise = (async () => { + const blob = await getCaptionThumbnailBlob(relPath); + if (!blob) return null; + const url = URL.createObjectURL(blob); + blobUrlCache.set(relPath, url); + return url; + })(); + pendingLoads.set(relPath, promise); + try { + return await promise; + } finally { + pendingLoads.delete(relPath); + } +} + +interface LazyRequest { + mediaId: string; + captionIndex: number; + timeSec: number; +} + +/** + * Resolve a caption thumbnail `thumbRelPath` to a blob URL.
When the + * persisted path is missing and a `lazy` descriptor is supplied, the + * generator is queued to seek the source media, capture a JPEG, persist + * it, and hand the resulting path back to this hook on a subsequent + * render (via the store patch inside `lazy-thumb.ts`). + */ +export function useCaptionThumbnail( + thumbRelPath: string | undefined, + lazy?: LazyRequest, +): string | null { + const [url, setUrl] = useState(() => ( + thumbRelPath ? blobUrlCache.get(thumbRelPath) ?? null : null + )); + const latestPath = useRef(thumbRelPath); + latestPath.current = thumbRelPath; + + useEffect(() => { + if (thumbRelPath) { + const cached = blobUrlCache.get(thumbRelPath); + if (cached) { + setUrl(cached); + return; + } + setUrl(null); + void loadBlobUrl(thumbRelPath).then((loaded) => { + if (latestPath.current === thumbRelPath) { + setUrl(loaded); + } + }); + return; + } + + // No persisted thumbnail — lazy-generate if we know how. + setUrl(null); + if (!lazy) return; + let cancelled = false; + void requestLazyCaptionThumbnail(lazy.mediaId, lazy.captionIndex, lazy.timeSec) + .then((relPath) => { + if (cancelled || !relPath) return; + void loadBlobUrl(relPath).then((loaded) => { + if (!cancelled && latestPath.current === undefined) { + setUrl(loaded); + } + }); + }); + return () => { cancelled = true; }; + }, [thumbRelPath, lazy?.mediaId, lazy?.captionIndex, lazy?.timeSec]); + + return url; +} diff --git a/src/features/scene-browser/hooks/use-library-palette.ts b/src/features/scene-browser/hooks/use-library-palette.ts new file mode 100644 index 000000000..12cb3fc22 --- /dev/null +++ b/src/features/scene-browser/hooks/use-library-palette.ts @@ -0,0 +1,56 @@ +import { useMemo } from 'react'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { + clusterPaletteEntries, + flattenLibraryPalettes, + type LabCluster, +} from '../utils/library-palette'; + +/** Target cluster count. Capped further by how many palettes exist. */ +const DEFAULT_K = 12; + +/** + * Collect every caption's palette across the library and cluster them + * into a small set of representative colors for the Color Mode picker. + * + * The hook reads from the media-library store (not the scene browser's + * embeddings cache) because captions are the source of truth — the + * palettes in `MediaCaption.palette` are what the ranker matches + * against, so the grid must reflect the same data. + */ +export function useLibraryPalette( + scope: string | null, + k = DEFAULT_K, +): LabCluster[] { + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + + return useMemo(() => { + const palettes: Array> = []; + for (const media of mediaItems) { + if (scope && media.id !== scope) continue; + const captions = media.aiCaptions; + if (!captions || captions.length === 0) continue; + for (const caption of captions) { + if (caption.palette && caption.palette.length > 0) { + palettes.push(caption.palette); + } + } + } + if (palettes.length === 0) return []; + + const flat = flattenLibraryPalettes(palettes.map((p) => p.map((e) => ({ + l: e.l, a: e.a, b: e.b, weight: e.weight, + })))); + const clusters = clusterPaletteEntries(flat, k); + + // Sort by aggregate weight so the grid leads with the library's + // dominant colors — skin, sky, greenery tend to surface first, + // with vivid accents trailing. Stable tiebreak on Lab so the order + // doesn't jitter across renders. 
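`clusterPaletteEntries` (invoked a few lines up) is not part of this diff; what the hook relies on is some weighted clustering over Lab space, along the lines of this sketch. The function name, shape, and the plain k-means choice are assumptions about `utils/library-palette`:

```ts
interface WeightedLab { l: number; a: number; b: number; weight: number }

// Plain weighted k-means over Lab coordinates: each palette entry pulls
// its nearest centroid proportionally to its weight, so colors that
// dominate thumbnails dominate the resulting swatch grid.
function kMeansLab(entries: WeightedLab[], k: number, iters = 10): WeightedLab[] {
  let centroids = entries.slice(0, Math.min(k, entries.length)).map((e) => ({ ...e }));
  for (let iter = 0; iter < iters; iter++) {
    const sums = centroids.map(() => ({ l: 0, a: 0, b: 0, weight: 0 }));
    entries.forEach((e) => {
      let best = 0;
      let bestDist = Number.POSITIVE_INFINITY;
      centroids.forEach((c, i) => {
        const d = (c.l - e.l) ** 2 + (c.a - e.a) ** 2 + (c.b - e.b) ** 2;
        if (d < bestDist) { bestDist = d; best = i; }
      });
      const s = sums[best];
      s.l += e.l * e.weight;
      s.a += e.a * e.weight;
      s.b += e.b * e.weight;
      s.weight += e.weight;
    });
    centroids = sums.map((s, i) => (s.weight > 0
      ? { l: s.l / s.weight, a: s.a / s.weight, b: s.b / s.weight, weight: s.weight }
      : centroids[i]));
  }
  return centroids;
}
```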
+ return clusters.slice().sort((a, b) => { + if (b.weight !== a.weight) return b.weight - a.weight; + if (a.l !== b.l) return a.l - b.l; + return a.a - b.a; + }); + }, [mediaItems, scope, k]); +} diff --git a/src/features/scene-browser/hooks/use-ranked-scenes.ts b/src/features/scene-browser/hooks/use-ranked-scenes.ts new file mode 100644 index 000000000..bc6332fb4 --- /dev/null +++ b/src/features/scene-browser/hooks/use-ranked-scenes.ts @@ -0,0 +1,265 @@ +import { useEffect, useMemo, useState } from 'react'; +import { createLogger } from '@/shared/logging/logger'; +import { clipProvider, embeddingsProvider } from '../deps/analysis'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { useSettingsStore } from '../deps/settings'; +import { useSceneBrowserStore } from '../stores/scene-browser-store'; +import { + getEmbeddingsSnapshot, + getImageEmbeddingsSnapshot, + getPalettesSnapshot, +} from '../utils/embeddings-cache'; +import { parseColorQuery } from '../utils/color-boost'; +import { rankScenes, type RankableScene, type ScoredScene } from '../utils/rank'; +import { semanticRank } from '../utils/semantic-rank'; + +const log = createLogger('SceneBrowser:RankedScenes'); + +export interface RankedScenesResult { + scenes: ScoredScene[]; + totalScenes: number; + totalClips: number; + clipsWithCaptions: number; + /** + * Filenames of media currently being Analyzed-with-AI (and therefore + * excluded from the scene list above). Exposed so the panel can surface + * a "re-analyzing" indicator while the old entries are hidden and the + * new ones haven't landed yet. + */ + reanalyzingMedia: Array<{ id: string; fileName: string }>; + /** + * Whether the active search mode produced the shown ranking. Semantic + * mode falls back to keyword while the query embedding is in flight — + * the panel can use this to show a subtle "embedding…" indicator. + */ + activeMode: 'keyword' | 'semantic'; + /** + * True when a non-empty query is being ranked — toggles per-row score + * chrome so browsing without a query doesn't look cluttered with 0% + * badges on every scene. + */ + isQuerying: boolean; + /** + * Count of scenes (not clips) whose text embedding is currently loaded + * in memory, vs. the total visible scene count. Gives the status-bar + * something concrete to say while the background indexer is still + * filling things in. + */ + sceneTextIndexed: number; + /** Same, for CLIP image embeddings. */ + sceneImageIndexed: number; + /** + * True while we're waiting on the query's semantic text embedding — + * old scenes still render via keyword fallback, but the panel can show + * a subtle "embedding query…" pill so the delay isn't mysterious. + */ + queryTextEmbedding: 'idle' | 'embedding' | 'ready'; + /** Same, for the CLIP text-encoder half of the query. */ + queryImageEmbedding: 'idle' | 'embedding' | 'ready'; +} + +/** + * Build the ranked scene list for the Scene Browser. The hook owns all + * joining between media metadata and caption records so components can + * treat each row as a self-contained record (filename, timestamp, thumb path). 
+ */ +export function useRankedScenes(): RankedScenesResult { + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + const taggingMediaIds = useMediaLibraryStore((s) => s.taggingMediaIds); + const query = useSceneBrowserStore((s) => s.query); + const scope = useSceneBrowserStore((s) => s.scope); + const sortMode = useSceneBrowserStore((s) => s.sortMode); + const reference = useSceneBrowserStore((s) => s.reference); + const captionSearchMode = useSettingsStore((s) => s.captionSearchMode); + const colorQuery = useMemo(() => parseColorQuery(query), [query]); + + // Embed the query with both text models when semantic mode is active. + // Keeping each in a separate state slot (rather than a Suspense promise + // or sync read) means typing stays fluid — old scenes remain visible + // while the new embedding is in flight. + const [queryEmbedding, setQueryEmbedding] = useState(null); + const [queryImageEmbedding, setQueryImageEmbedding] = useState(null); + const [queryTextState, setQueryTextState] = useState<'idle' | 'embedding' | 'ready'>('idle'); + const [queryImageState, setQueryImageState] = useState<'idle' | 'embedding' | 'ready'>('idle'); + + useEffect(() => { + if (captionSearchMode !== 'semantic' || query.trim().length === 0) { + setQueryEmbedding(null); + setQueryTextState('idle'); + return; + } + if (colorQuery.paletteOnly) { + setQueryEmbedding(new Float32Array(0)); + setQueryTextState('ready'); + return; + } + let cancelled = false; + setQueryTextState('embedding'); + void embeddingsProvider + .embed(query.trim()) + .then((vector) => { + if (cancelled) return; + setQueryEmbedding(vector); + setQueryTextState('ready'); + }) + .catch((error) => { + if (!cancelled) { + log.warn('Query text embedding failed — falling back to keyword', { query, error }); + setQueryEmbedding(null); + setQueryTextState('idle'); + } + }); + return () => { cancelled = true; }; + }, [captionSearchMode, query, colorQuery.paletteOnly]); + + // CLIP text-encoder embedding for the visual side. Loaded independently + // so a slow CLIP download doesn't block text-side ranking — scenes can + // be shown via text-only cosine until the CLIP query vector lands. + useEffect(() => { + if ( + captionSearchMode !== 'semantic' + || query.trim().length === 0 + || colorQuery.paletteOnly + ) { + setQueryImageEmbedding(null); + setQueryImageState('idle'); + return; + } + let cancelled = false; + setQueryImageState('embedding'); + // Use the ensembled path so a one-word query ("fighting") gets + // wrapped in natural-sentence templates before embedding — CLIP is + // badly behaved on bare tokens and the averaged vector materially + // reduces false positives like "a tower matches fighting". 
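`embedQueryForImages` is opaque in this diff; the natural-sentence wrapping the comment above alludes to is the standard CLIP prompt-ensembling recipe, roughly as follows. The template list and the `embed` parameter are assumptions about the provider's internals:

```ts
// Illustrative prompt-ensembling for a CLIP text query: embed the query
// under several natural-sentence templates, average, and re-normalize.
const TEMPLATES = [
  (q: string) => `a photo of ${q}`,
  (q: string) => `a video frame showing ${q}`,
  (q: string) => `a scene with ${q}`,
];

async function embedEnsembled(
  embed: (text: string) => Promise<Float32Array>,
  query: string,
): Promise<Float32Array> {
  const vectors = await Promise.all(TEMPLATES.map((t) => embed(t(query))));
  const mean = new Float32Array(vectors[0].length);
  for (const v of vectors) {
    for (let i = 0; i < mean.length; i++) mean[i] += v[i] / vectors.length;
  }
  // Re-normalize so downstream cosine scores stay calibrated.
  let norm = 0;
  for (let i = 0; i < mean.length; i++) norm += mean[i] * mean[i];
  norm = Math.sqrt(norm) || 1;
  for (let i = 0; i < mean.length; i++) mean[i] /= norm;
  return mean;
}
```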
+ void clipProvider + .embedQueryForImages(query.trim()) + .then((vector) => { + if (cancelled) return; + if (vector) { + setQueryImageEmbedding(vector); + setQueryImageState('ready'); + } else { + setQueryImageState('idle'); + } + }) + .catch((error) => { + if (!cancelled) { + log.warn('CLIP query embedding failed — skipping visual ranking', { query, error }); + setQueryImageEmbedding(null); + setQueryImageState('idle'); + } + }); + return () => { cancelled = true; }; + }, [captionSearchMode, query, colorQuery.paletteOnly]); + + return useMemo(() => { + const allScenes: RankableScene[] = []; + const reanalyzingMedia: Array<{ id: string; fileName: string }> = []; + let clipsWithCaptions = 0; + + for (const media of mediaItems) { + if (scope && media.id !== scope) continue; + // Hide entries for media that's actively being Analyzed-with-AI — + // the old captions are about to be replaced, and surfacing them + // alongside "re-analyzing" state would be misleading. + if (taggingMediaIds.has(media.id)) { + if (media.aiCaptions && media.aiCaptions.length > 0) { + reanalyzingMedia.push({ id: media.id, fileName: media.fileName }); + } + continue; + } + const captions = media.aiCaptions; + if (!captions || captions.length === 0) continue; + clipsWithCaptions += 1; + captions.forEach((caption, captionIndex) => { + allScenes.push({ + id: `${media.id}:${captionIndex}`, + mediaId: media.id, + mediaFileName: media.fileName, + timeSec: caption.timeSec, + text: caption.text, + thumbRelPath: caption.thumbRelPath, + palette: caption.palette, + }); + }); + } + + const isSemanticActive = ( + captionSearchMode === 'semantic' + && query.trim().length > 0 + && queryEmbedding !== null + ); + + const textEmbeddings = getEmbeddingsSnapshot(); + const imageEmbeddings = getImageEmbeddingsSnapshot(); + const paletteSnapshot = getPalettesSnapshot(); + + // A reference palette forces semantic-lane ranking (palette-only + // scoring inside semanticRank). The query stays visible in the input + // but is ignored until the reference is cleared. + let ranked; + if (reference) { + ranked = semanticRank(new Float32Array(0), allScenes, textEmbeddings, { + palettes: paletteSnapshot, + referencePalette: reference.palette, + }); + } else if (isSemanticActive) { + ranked = semanticRank(queryEmbedding!, allScenes, textEmbeddings, { + queryImageEmbedding, + imageEmbeddings, + query, + palettes: paletteSnapshot, + }); + } else { + ranked = rankScenes(query, allScenes); + } + + // Coverage stats over the scenes the user can currently see, not + // over the whole library — keeps the "indexed" counter honest when + // scoped to a single media. + let sceneTextIndexed = 0; + let sceneImageIndexed = 0; + for (const scene of allScenes) { + if (textEmbeddings.has(scene.id)) sceneTextIndexed += 1; + if (imageEmbeddings.has(scene.id)) sceneImageIndexed += 1; + } + + const hasRankingSignal = query.trim().length > 0 || !!reference; + if (!hasRankingSignal || sortMode === 'time' || sortMode === 'name') { + ranked.sort((a, b) => { + if (a.mediaFileName !== b.mediaFileName) { + return a.mediaFileName.localeCompare(b.mediaFileName); + } + return a.timeSec - b.timeSec; + }); + } + // relevance sort is the default output of rankScenes / semanticRank. + + return { + scenes: ranked, + totalScenes: allScenes.length, + totalClips: mediaItems.length, + clipsWithCaptions, + reanalyzingMedia, + activeMode: isSemanticActive || reference ? 
'semantic' : 'keyword', + isQuerying: hasRankingSignal, + sceneTextIndexed, + sceneImageIndexed, + queryTextEmbedding: queryTextState, + queryImageEmbedding: queryImageState, + }; + }, [ + mediaItems, + taggingMediaIds, + query, + scope, + sortMode, + captionSearchMode, + queryEmbedding, + queryImageEmbedding, + queryTextState, + queryImageState, + colorQuery.paletteOnly, + reference, + ]); +} diff --git a/src/features/scene-browser/hooks/use-semantic-index.test.tsx b/src/features/scene-browser/hooks/use-semantic-index.test.tsx new file mode 100644 index 000000000..59bbd44af --- /dev/null +++ b/src/features/scene-browser/hooks/use-semantic-index.test.tsx @@ -0,0 +1,111 @@ +import { render, screen, waitFor } from '@testing-library/react'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { create } from 'zustand'; + +const ensureEmbeddingsLoadedMock = vi.fn(); +const indexMediaCaptionsMock = vi.fn(); +const indexMediaImageCaptionsMock = vi.fn(); +const isMediaMissingEmbeddingsMock = vi.fn(); +const isMediaMissingImageEmbeddingsMock = vi.fn(); + +type MediaItem = { + id: string; + aiCaptions?: Array<{ timeSec: number; text: string }>; +}; + +const useMediaLibraryStore = create<{ + mediaItems: MediaItem[]; + taggingMediaIds: Set; +}>(() => ({ + mediaItems: [], + taggingMediaIds: new Set(), +})); + +const useSettingsStore = create<{ + captionSearchMode: 'keyword' | 'semantic'; +}>(() => ({ + captionSearchMode: 'keyword', +})); + +vi.mock('../deps/media-library', () => ({ + useMediaLibraryStore, +})); + +vi.mock('../deps/settings', () => ({ + useSettingsStore, +})); + +vi.mock('../utils/embeddings-cache', () => ({ + ensureEmbeddingsLoaded: ensureEmbeddingsLoadedMock, + indexMediaCaptions: indexMediaCaptionsMock, + indexMediaImageCaptions: indexMediaImageCaptionsMock, + isMediaMissingEmbeddings: isMediaMissingEmbeddingsMock, + isMediaMissingImageEmbeddings: isMediaMissingImageEmbeddingsMock, +})); + +const { useSemanticIndex } = await import('./use-semantic-index'); + +function SemanticIndexProbe() { + const progress = useSemanticIndex(); + return ( +
+ ); +} + +describe('useSemanticIndex', () => { + beforeEach(() => { + vi.clearAllMocks(); + useSettingsStore.setState({ captionSearchMode: 'semantic' }); + useMediaLibraryStore.setState({ + mediaItems: [ + { + id: 'media-1', + aiCaptions: [{ timeSec: 0, text: 'A scene' }], + }, + ], + taggingMediaIds: new Set(), + }); + }); + + it('clears stale progress when a rerun becomes a no-op after store updates', async () => { + let textIndexed = false; + + ensureEmbeddingsLoadedMock.mockResolvedValue(undefined); + isMediaMissingEmbeddingsMock.mockImplementation(() => !textIndexed); + isMediaMissingImageEmbeddingsMock.mockReturnValue(false); + indexMediaImageCaptionsMock.mockResolvedValue(undefined); + indexMediaCaptionsMock.mockImplementation(async (mediaId: string) => { + await Promise.resolve(); + textIndexed = true; + useMediaLibraryStore.setState((state) => ({ + mediaItems: state.mediaItems.map((item) => ( + item.id === mediaId + ? { ...item, aiCaptions: [...(item.aiCaptions ?? [])] } + : item + )), + })); + await Promise.resolve(); + }); + + render(); + + await waitFor(() => { + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-total', '1'); + }); + + await waitFor(() => { + expect(indexMediaCaptionsMock).toHaveBeenCalledTimes(1); + }); + + await waitFor(() => { + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-indexing', '0'); + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-total', '0'); + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-loading', 'false'); + }); + }); +}); diff --git a/src/features/scene-browser/hooks/use-semantic-index.ts b/src/features/scene-browser/hooks/use-semantic-index.ts new file mode 100644 index 000000000..cc81ab288 --- /dev/null +++ b/src/features/scene-browser/hooks/use-semantic-index.ts @@ -0,0 +1,141 @@ +/** + * Orchestrates retroactive semantic indexing when the user switches into + * semantic mode. Hydrates embeddings for media that already have them on + * disk; runs the embedding model for media that don't. + * + * Exposes progress so the panel can surface a banner ("Indexing 3/12 + * clips…") while work is in flight. Designed to be safe to mount many + * times — the underlying cache + promise maps deduplicate real work. + */ + +import { useEffect, useRef, useState } from 'react'; +import { createLogger } from '@/shared/logging/logger'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { useSettingsStore } from '../deps/settings'; +import { + ensureEmbeddingsLoaded, + indexMediaCaptions, + indexMediaImageCaptions, + isMediaMissingEmbeddings, + isMediaMissingImageEmbeddings, +} from '../utils/embeddings-cache'; + +const log = createLogger('SceneBrowser:SemanticIndex'); + +export interface SemanticIndexProgress { + /** Running indexer is generating fresh embeddings (slow path). */ + indexing: number; + /** Total clips that need indexing in the current pass. */ + indexTotal: number; + /** Model is downloading — blocks even the hydration path. */ + loadingModel: boolean; + /** A clip just finished indexing — used by the banner for a pulse. 
*/ + lastCompletedAt: number; +} + +const INITIAL_PROGRESS: SemanticIndexProgress = { + indexing: 0, + indexTotal: 0, + loadingModel: false, + lastCompletedAt: 0, +}; + +export function useSemanticIndex(): SemanticIndexProgress { + const mode = useSettingsStore((s) => s.captionSearchMode); + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + const taggingMediaIds = useMediaLibraryStore((s) => s.taggingMediaIds); + const [progress, setProgress] = useState(INITIAL_PROGRESS); + const runIdRef = useRef(0); + + useEffect(() => { + if (mode !== 'semantic') { + setProgress(INITIAL_PROGRESS); + return; + } + + const runId = ++runIdRef.current; + + const candidates = mediaItems.filter((media) => ( + (media.aiCaptions?.length ?? 0) > 0 && !taggingMediaIds.has(media.id) + )); + if (candidates.length === 0) { + setProgress(INITIAL_PROGRESS); + return; + } + + let cancelled = false; + + void (async () => { + // Phase 1: hydrate everything that already has on-disk embeddings. + // Parallel because the bulk of the work is just reading a small bin. + await Promise.all(candidates.map((media) => ensureEmbeddingsLoaded(media.id))); + if (cancelled || runId !== runIdRef.current) return; + + // Phase 2: fill in text embeddings that are missing (fast path on + // already-downloaded all-MiniLM model, ~20ms per caption). + const needsTextIndex = candidates.filter((media) => isMediaMissingEmbeddings(media.id)); + const needsImageIndex = candidates.filter((media) => isMediaMissingImageEmbeddings(media.id)); + const totalToIndex = needsTextIndex.length + needsImageIndex.length; + if (totalToIndex === 0) { + setProgress(INITIAL_PROGRESS); + return; + } + + setProgress({ + indexing: 0, + indexTotal: totalToIndex, + loadingModel: true, + lastCompletedAt: 0, + }); + + let done = 0; + const advance = () => { + done += 1; + setProgress({ + indexing: done, + indexTotal: totalToIndex, + loadingModel: false, + lastCompletedAt: Date.now(), + }); + }; + + for (const media of needsTextIndex) { + if (cancelled || runId !== runIdRef.current) return; + try { + await indexMediaCaptions(media.id); + } catch (error) { + log.warn('Retroactive text embedding failed', { + mediaId: media.id, fileName: media.fileName, error, + }); + } + advance(); + } + + // Phase 3: image indexing. This is the expensive side — CLIP is + // ~90 MB to download and ~50 ms per image, so do it strictly after + // text indexing so at least keyword → text semantic is immediately + // usable while visual search warms up. + for (const media of needsImageIndex) { + if (cancelled || runId !== runIdRef.current) return; + try { + await indexMediaImageCaptions(media.id); + } catch (error) { + log.warn('Retroactive image embedding failed', { + mediaId: media.id, fileName: media.fileName, error, + }); + } + advance(); + } + + if (!cancelled && runId === runIdRef.current) { + setProgress(INITIAL_PROGRESS); + } + })(); + + return () => { + cancelled = true; + }; + }, [mode, mediaItems, taggingMediaIds]); + + return progress; +} diff --git a/src/features/scene-browser/index.ts b/src/features/scene-browser/index.ts new file mode 100644 index 000000000..274eb2c07 --- /dev/null +++ b/src/features/scene-browser/index.ts @@ -0,0 +1,13 @@ +/** + * Scene Browser — cross-library visual search for AI-generated captions. + * + * Public API: + * - `` — the full panel; mount inside the media-library + * body when `useSceneBrowserStore.open === true`. + * - `useSceneBrowserStore` — control open/close, query, scope, sort. 
+ */ + +export { SceneBrowserPanel } from './components/scene-browser-panel'; +export { useSceneBrowserStore } from './stores/scene-browser-store'; +export type { SceneBrowserSortMode } from './stores/scene-browser-store'; +export { invalidateMediaCaptionThumbnails } from './utils/invalidate'; diff --git a/src/features/scene-browser/stores/scene-browser-store.ts b/src/features/scene-browser/stores/scene-browser-store.ts new file mode 100644 index 000000000..193df7920 --- /dev/null +++ b/src/features/scene-browser/stores/scene-browser-store.ts @@ -0,0 +1,128 @@ +import { create, type StoreApi, type UseBoundStore } from 'zustand'; +import type { PaletteEntry } from '../deps/analysis'; + +export type SceneBrowserSortMode = 'relevance' | 'time' | 'name'; +export type SceneBrowserViewMode = 'list' | 'grid'; + +export interface SceneBrowserReference { + /** Scene id whose palette is the reference — for dedupe and the clear chip. */ + sceneId: string; + /** Short human label (e.g. `"foo.mp4 · 0:12"`) shown in the chip. */ + label: string; + /** The reference palette (CIELAB + weight). */ + palette: PaletteEntry[]; +} + +/** + * `scope === null` is the default cross-library view. A non-null scope is + * the mediaId the Scene Browser was opened from — set when the user clicks + * "Open in Scene Browser" from a media card's info popover. + */ +interface SceneBrowserState { + open: boolean; + query: string; + scope: string | null; + sortMode: SceneBrowserSortMode; + /** Incrementing token the search input watches to force a focus. */ + focusNonce: number; + /** + * Active "find similar palette" reference. When set, the ranker scores + * scenes by palette distance against this reference instead of by + * query semantics. Cleared explicitly (chip × or escape). + */ + reference: SceneBrowserReference | null; + /** + * Panel-local Color Mode — swaps the search input for a grid of the + * library's dominant colors. Orthogonal to captionSearchMode; a user + * can come back to their preferred keyword/semantic lane by toggling + * it off. Not persisted so the default is always "text search". + */ + colorMode: boolean; + /** + * List vs grid layout for the results area. Grid is a responsive + * thumbnail-first layout (good for color/visual scanning); list is + * thumbnail + caption text (good for reading matches). + */ + viewMode: SceneBrowserViewMode; +} + +interface SceneBrowserActions { + openBrowser: (options?: { mediaId?: string | null; focus?: boolean }) => void; + closeBrowser: () => void; + toggleBrowser: () => void; + setQuery: (query: string) => void; + setScope: (scope: string | null) => void; + setSortMode: (mode: SceneBrowserSortMode) => void; + requestFocus: () => void; + setReference: (reference: SceneBrowserReference | null) => void; + setColorMode: (colorMode: boolean) => void; + setViewMode: (viewMode: SceneBrowserViewMode) => void; + reset: () => void; +} + +const INITIAL_STATE: SceneBrowserState = { + open: false, + query: '', + scope: null, + sortMode: 'relevance', + focusNonce: 0, + reference: null, + colorMode: false, + viewMode: 'list', +}; + +type SceneBrowserStoreApi = UseBoundStore>; + +declare global { + // eslint-disable-next-line no-var + var __FREECUT_SCENE_BROWSER_STORE__: SceneBrowserStoreApi | undefined; +} + +const hotStore = import.meta.env.DEV ? globalThis.__FREECUT_SCENE_BROWSER_STORE__ : undefined; + +// Preserve query/scope/color-mode/reference across Vite HMR in dev so a +// file save doesn't wipe the panel's current search context. 
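The `hotStore ?? create(...)` expression just below is the standard dev-only trick for keeping a Zustand store alive across Vite HMR; distilled to its essentials on a toy counter store (assumes Vite's `import.meta.env.DEV` flag, as in the real file):

```ts
import { create } from 'zustand';

interface CounterState { count: number; inc: () => void }

declare global {
  // eslint-disable-next-line no-var
  var __HOT_COUNTER_STORE__: ReturnType<typeof makeStore> | undefined;
}

function makeStore() {
  return create<CounterState>((set) => ({
    count: 0,
    inc: () => set((s) => ({ count: s.count + 1 })),
  }));
}

// On hot reload the module re-executes; reusing the instance stashed on
// globalThis keeps existing subscribers and state alive in dev, while
// production always gets a fresh store.
export const useCounterStore =
  (import.meta.env.DEV && globalThis.__HOT_COUNTER_STORE__) || makeStore();

if (import.meta.env.DEV) {
  globalThis.__HOT_COUNTER_STORE__ = useCounterStore;
}
```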
+const sceneBrowserStore: SceneBrowserStoreApi = hotStore ?? create((set) => ({ + ...INITIAL_STATE, + + openBrowser: (options) => set((state) => ({ + open: true, + scope: options?.mediaId !== undefined ? options.mediaId : state.scope, + focusNonce: options?.focus === false ? state.focusNonce : state.focusNonce + 1, + })), + + closeBrowser: () => set({ open: false }), + + toggleBrowser: () => set((state) => ({ + open: !state.open, + focusNonce: !state.open ? state.focusNonce + 1 : state.focusNonce, + })), + + setQuery: (query) => set({ query }), + + setScope: (scope) => set({ scope }), + + setSortMode: (sortMode) => set({ sortMode }), + + requestFocus: () => set((state) => ({ focusNonce: state.focusNonce + 1 })), + + setReference: (reference) => set({ reference }), + + setViewMode: (viewMode) => set({ viewMode }), + + setColorMode: (colorMode) => set((state) => ({ + colorMode, + // Leaving color mode clears any active reference — the mode is the + // only way to land on one, so the chip shouldn't outlive the mode. + reference: colorMode ? state.reference : null, + query: colorMode ? '' : state.query, + })), + + reset: () => set(INITIAL_STATE), +})); + +if (import.meta.env.DEV) { + globalThis.__FREECUT_SCENE_BROWSER_STORE__ = sceneBrowserStore; +} + +export const useSceneBrowserStore = sceneBrowserStore; diff --git a/src/features/scene-browser/utils/color-boost.test.ts b/src/features/scene-browser/utils/color-boost.test.ts new file mode 100644 index 000000000..fbdc60fcc --- /dev/null +++ b/src/features/scene-browser/utils/color-boost.test.ts @@ -0,0 +1,239 @@ +import { describe, expect, it } from 'vitest'; +import { + colorBoostFor, + extractQueryColors, + nearestColorFamily, + palettePairDistance, + paletteSimilarityBoost, + parseColorQuery, +} from './color-boost'; +import type { PaletteEntry } from '../deps/analysis'; + +describe('extractQueryColors', () => { + it('finds a single family for an explicit color-intent query', () => { + const result = extractQueryColors('red color'); + expect(result.map((r) => r.family)).toEqual(['red']); + }); + + it('maps synonyms to their canonical family when the query is palette-oriented', () => { + const result = extractQueryColors('crimson tones'); + expect(result.map((r) => r.family)).toEqual(['red']); + }); + + it('returns empty for queries with no color terms', () => { + expect(extractQueryColors('a man fighting')).toEqual([]); + }); + + it('extracts color families for bare color-only queries', () => { + // A query that is *only* color words has no object semantics — the + // ranker should match against the palette instead of sending CLIP + // chasing unrelated captions that happen to cluster near the token. 
+ expect(extractQueryColors('ruby scarlet red').map((c) => c.family)).toEqual(['red']); + expect(extractQueryColors('pink').map((c) => c.family)).toEqual(['pink']); + }); + + it('returns empty when non-color content words are present without explicit palette intent', () => { + expect(extractQueryColors('orange sunset navy water')).toEqual([]); + }); + + it('supports color prefix syntax and multiple distinct families', () => { + const result = extractQueryColors('color orange palette navy'); + expect(result.map((r) => r.family).sort()).toEqual(['blue', 'orange']); + }); +}); + +describe('parseColorQuery', () => { + it('marks pure color-intent queries as palette-only', () => { + expect(parseColorQuery('yellow color')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: true, + }); + expect(parseColorQuery('color:yellow')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: true, + }); + }); + + it('keeps mixed content queries out of palette-only mode', () => { + expect(parseColorQuery('yellow color jacket')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: false, + }); + }); + + it('treats bare single-color queries as palette intent', () => { + expect(parseColorQuery('pink')).toMatchObject({ + colors: [{ family: 'pink' }], + paletteOnly: true, + }); + }); + + it('treats multi-color-only queries as palette intent', () => { + expect(parseColorQuery('pink purple')).toMatchObject({ + colors: [{ family: 'pink' }, { family: 'purple' }], + paletteOnly: true, + }); + }); +}); + +const REDS: PaletteEntry = { l: 53, a: 70, b: 50, weight: 0.5 }; +const GREEN: PaletteEntry = { l: 60, a: -55, b: 50, weight: 0.3 }; +const BLUES: PaletteEntry = { l: 40, a: 15, b: -60, weight: 0.2 }; + +describe('colorBoostFor', () => { + it('returns a non-zero boost when palette contains the query color', () => { + const queries = extractQueryColors('red color'); + const result = colorBoostFor(queries, [REDS, GREEN, BLUES]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('red'); + expect(result?.boost).toBeGreaterThan(0.1); + }); + + it('returns null when palette has no close match', () => { + const queries = extractQueryColors('red color'); + const result = colorBoostFor(queries, [ + { l: 60, a: -55, b: 50, weight: 1.0 }, + ]); + expect(result).toBeNull(); + }); + + it('returns null for empty palette', () => { + const queries = extractQueryColors('red color'); + expect(colorBoostFor(queries, [])).toBeNull(); + expect(colorBoostFor(queries, undefined)).toBeNull(); + }); + + it('returns null for query without color words', () => { + const queries = extractQueryColors('a scene with people'); + expect(colorBoostFor(queries, [REDS, GREEN, BLUES])).toBeNull(); + }); + + it('weighs larger palette entries higher', () => { + const queries = extractQueryColors('red color'); + const majorRed = colorBoostFor(queries, [{ l: 53, a: 70, b: 50, weight: 0.8 }]); + const minorRed = colorBoostFor(queries, [{ l: 53, a: 70, b: 50, weight: 0.05 }]); + expect(majorRed?.boost).toBeGreaterThan(minorRed?.boost ?? 0); + }); + + it('picks the best match across multiple query colors', () => { + const queries = extractQueryColors('red and blue palette'); + const result = colorBoostFor(queries, [ + { l: 50, a: 50, b: 40, weight: 0.2 }, + { l: 42, a: 18, b: -58, weight: 0.7 }, + ]); + expect(result?.family).toBe('blue'); + }); + + it('does not match pink against warm skin-tone palette entries', () => { + // Lab ~(65, 20, 20) is a common medium skin tone — warm, moderate + // chroma. 
It sat within the old pink boost range and polluted "pink" + // results with face-dominated dim scenes. + const queries = extractQueryColors('pink'); + const result = colorBoostFor(queries, [ + { l: 65, a: 20, b: 20, weight: 0.5 }, + { l: 40, a: 10, b: 15, weight: 0.3 }, + ]); + expect(result).toBeNull(); + }); + + it('matches pink against genuinely pink palette entries', () => { + const queries = extractQueryColors('pink'); + const result = colorBoostFor(queries, [ + { l: 65, a: 55, b: -5, weight: 0.4 }, + { l: 20, a: 5, b: 5, weight: 0.4 }, + ]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('pink'); + expect(result?.boost).toBeGreaterThan(0.1); + }); + + it('does not match chromatic families against low-chroma gray palette entries', () => { + const queries = extractQueryColors('red'); + const result = colorBoostFor(queries, [ + { l: 55, a: 2, b: 1, weight: 0.8 }, // near-gray + ]); + expect(result).toBeNull(); + }); + + it('still matches neutral families against low-chroma entries', () => { + // The chroma/hue gate applies only to chromatic families — gray, + // black, white should still match near-neutral palette entries. + const queries = extractQueryColors('gray tones'); + const result = colorBoostFor(queries, [ + { l: 55, a: 2, b: 1, weight: 0.6 }, + ]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('gray'); + }); +}); + +describe('nearestColorFamily', () => { + it('maps a clearly chromatic swatch to the obvious family', () => { + expect(nearestColorFamily({ l: 53, a: 70, b: 50 })).toBe('red'); + expect(nearestColorFamily({ l: 40, a: 15, b: -60 })).toBe('blue'); + expect(nearestColorFamily({ l: 90, a: -5, b: 80 })).toBe('yellow'); + }); + + it('maps near-neutral swatches to gray/black/white', () => { + expect(nearestColorFamily({ l: 55, a: 0, b: 0 })).toBe('gray'); + expect(nearestColorFamily({ l: 95, a: 0, b: 0 })).toBe('white'); + expect(nearestColorFamily({ l: 10, a: 0, b: 0 })).toBe('black'); + }); +}); + +describe('palettePairDistance', () => { + it('returns 0 for identical palettes', () => { + const a: PaletteEntry[] = [ + { l: 50, a: 60, b: 40, weight: 0.6 }, + { l: 40, a: 20, b: -50, weight: 0.4 }, + ]; + expect(palettePairDistance(a, a)).toBeCloseTo(0, 5); + }); + + it('is symmetric', () => { + const a: PaletteEntry[] = [{ l: 60, a: 40, b: 30, weight: 0.8 }]; + const b: PaletteEntry[] = [{ l: 65, a: 45, b: 20, weight: 1.0 }]; + expect(palettePairDistance(a, b)).toBeCloseTo(palettePairDistance(b, a), 5); + }); + + it('returns a larger distance for perceptually different palettes', () => { + const warmReds: PaletteEntry[] = [{ l: 53, a: 70, b: 50, weight: 1 }]; + const coolBlues: PaletteEntry[] = [{ l: 40, a: 15, b: -60, weight: 1 }]; + expect(palettePairDistance(warmReds, coolBlues)).toBeGreaterThan(40); + }); + + it('returns infinity for empty palettes', () => { + const a: PaletteEntry[] = [{ l: 50, a: 0, b: 0, weight: 1 }]; + expect(palettePairDistance(a, [])).toBe(Number.POSITIVE_INFINITY); + expect(palettePairDistance([], a)).toBe(Number.POSITIVE_INFINITY); + }); +}); + +describe('paletteSimilarityBoost', () => { + it('produces a non-zero boost for similar palettes', () => { + const ref: PaletteEntry[] = [ + { l: 50, a: 60, b: 40, weight: 0.7 }, + { l: 40, a: 20, b: -50, weight: 0.3 }, + ]; + const candidate: PaletteEntry[] = [ + { l: 52, a: 62, b: 38, weight: 0.6 }, + { l: 42, a: 18, b: -52, weight: 0.4 }, + ]; + const result = paletteSimilarityBoost(ref, candidate); + expect(result).not.toBeNull(); + 
expect(result?.boost).toBeGreaterThan(0.1);
+    expect(result?.distance).toBeLessThan(10);
+  });
+
+  it('returns null for clearly dissimilar palettes', () => {
+    const warmReds: PaletteEntry[] = [{ l: 53, a: 70, b: 50, weight: 1 }];
+    const coolGreens: PaletteEntry[] = [{ l: 60, a: -55, b: 50, weight: 1 }];
+    expect(paletteSimilarityBoost(warmReds, coolGreens)).toBeNull();
+  });
+
+  it('returns null for missing inputs', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 0, b: 0, weight: 1 }];
+    expect(paletteSimilarityBoost(undefined, a)).toBeNull();
+    expect(paletteSimilarityBoost(a, undefined)).toBeNull();
+  });
+});
diff --git a/src/features/scene-browser/utils/color-boost.ts b/src/features/scene-browser/utils/color-boost.ts
new file mode 100644
index 000000000..fa00fe2bf
--- /dev/null
+++ b/src/features/scene-browser/utils/color-boost.ts
@@ -0,0 +1,365 @@
+/**
+ * Color-query boost for semantic search.
+ *
+ * CLIP is weak on pure color queries — it was trained on object-centric
+ * captions, so "red color" drifts to whatever CLIP happens to associate
+ * with the token. Industry CBIR systems (Imgix, TinEye) sidestep this
+ * entirely by pre-extracting dominant colors per image and matching
+ * query colors via ∆E in CIELAB, the approximately-perceptually-uniform
+ * color space. We do the same here, using the pre-computed palette on
+ * each `MediaCaption.palette`.
+ *
+ * Output: a ColorBoost per scene with the closest palette match, its
+ * perceptual distance, and a score contribution calibrated to cosine
+ * magnitudes so it composes cleanly with the text/image scores.
+ */
+
+import { deltaE2000, type LabColor, type PaletteEntry } from '../deps/analysis';
+
+export interface ColorBoostResult {
+  /** Additive score contribution, in cosine-compatible units. */
+  boost: number;
+  /** Query color family that matched (e.g. "red"). */
+  family: string;
+  /** Minimum ∆E across the scene's palette. */
+  deltaE: number;
+  /** The palette entry that produced the minimum distance. */
+  matched: PaletteEntry;
+}
+
+/**
+ * Tuned so that a visually-identical match (∆E ~0) on a dominant color
+ * contributes the full 0.18 — roughly one confidence tier. ∆E ≥ 30
+ * ("obviously different") gives 0. Linear falloff in between keeps the
+ * math simple and explains itself in chip tooltips.
+ */
+const MAX_BOOST = 0.18;
+const ZERO_BOOST_DELTA_E = 30;
+
+function boostFromDeltaE(deltaE: number, weight: number): number {
+  if (deltaE >= ZERO_BOOST_DELTA_E) return 0;
+  const linear = (ZERO_BOOST_DELTA_E - deltaE) / ZERO_BOOST_DELTA_E;
+  // Weight shrinks the contribution when the matched color is a tiny
+  // fraction of the thumbnail (a 3% pixel slice of red doesn't really
+  // make the scene "red").
+  const weightFactor = Math.min(1, weight / 0.2);
+  return MAX_BOOST * linear * weightFactor;
+}
+
+/** Families that demand a visibly chromatic palette entry to match. */
+const CHROMATIC_FAMILIES = new Set([
+  'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'coral',
+]);
+
+/** Minimum chroma (sqrt(a²+b²)) for a palette entry to match a chromatic family. */
+const MIN_ENTRY_CHROMA = 15;
+
+/** Max hue-angle difference (degrees) between palette entry and chromatic family. */
+const MAX_HUE_DELTA_DEG = 45;
+
+function labHueDeg(a: number, b: number): number {
+  const deg = (Math.atan2(b, a) * 180) / Math.PI;
+  return deg < 0 ? deg + 360 : deg;
+}
+
+function labChroma(a: number, b: number): number {
+  return Math.sqrt(a * a + b * b);
+}
+
+function hueDelta(h1: number, h2: number): number {
+  const diff = Math.abs(h1 - h2) % 360;
+  return diff > 180 ? 360 - diff : diff;
+}
+
+/**
+ * Gate low-chroma or off-hue palette entries out of chromatic family
+ * matches. ∆E 2000 gracefully collapses hue weight for gray-ish colors,
+ * which is correct for color science but wrong for user intent: a user
+ * asking for "pink" doesn't want a beige scene that happens to sit
+ * near-ish to the pink Lab reference. Neutral families (white/black/
+ * gray/brown) bypass the gate — their whole point is low-chroma matching.
+ */
+function paletteEntryCompatibleWithFamily(
+  family: ColorFamilyDefinition,
+  entry: PaletteEntry,
+): boolean {
+  if (!CHROMATIC_FAMILIES.has(family.family)) return true;
+  if (labChroma(entry.a, entry.b) < MIN_ENTRY_CHROMA) return false;
+  const familyHue = labHueDeg(family.lab.a, family.lab.b);
+  const entryHue = labHueDeg(entry.a, entry.b);
+  return hueDelta(familyHue, entryHue) <= MAX_HUE_DELTA_DEG;
+}
+
+/**
+ * Canonical Lab coordinates for each color family, plus the synonyms
+ * that map into it. Values are mid-saturation reference points — for
+ * `red` we pick a slightly-desaturated Lab(53, 70, 50) rather than
+ * pure-sRGB red (Lab 53, 80, 67) because VLM-described "reds" in
+ * natural footage tend to sit a bit off the primary.
+ *
+ * The list stays conservative to avoid false-positive query parses
+ * ("rose" as a flower vs. "rose" as a color — we accept the color
+ * reading; users can always add descriptive words to disambiguate).
+ */
+export interface ColorFamilyDefinition {
+  family: string;
+  lab: LabColor;
+  synonyms: string[];
+}
+
+export interface ParsedColorQuery {
+  colors: ColorFamilyDefinition[];
+  /**
+   * True when the query is asking for palette alone (e.g. `color:yellow`,
+   * `yellow color`, `crimson tones`) rather than "yellow jacket" /
+   * "blue car" object semantics.
+   */
+  paletteOnly: boolean;
+}
+
+const COLOR_FAMILIES: ColorFamilyDefinition[] = [
+  { family: 'red', lab: { l: 53, a: 70, b: 50 }, synonyms: ['red', 'crimson', 'scarlet', 'maroon', 'ruby', 'burgundy'] },
+  { family: 'orange', lab: { l: 65, a: 40, b: 65 }, synonyms: ['orange', 'amber', 'tangerine', 'peach', 'apricot'] },
+  { family: 'yellow', lab: { l: 90, a: -5, b: 80 }, synonyms: ['yellow', 'golden', 'gold', 'mustard', 'lemon'] },
+  { family: 'green', lab: { l: 60, a: -55, b: 50 }, synonyms: ['green', 'emerald', 'lime', 'olive', 'forest', 'mint', 'sage'] },
+  { family: 'teal', lab: { l: 60, a: -40, b: -15 }, synonyms: ['teal', 'turquoise', 'cyan', 'aqua'] },
+  { family: 'blue', lab: { l: 40, a: 15, b: -60 }, synonyms: ['blue', 'navy', 'azure', 'cobalt', 'indigo', 'sapphire'] },
+  { family: 'purple', lab: { l: 40, a: 50, b: -45 }, synonyms: ['purple', 'violet', 'magenta', 'lavender', 'plum', 'lilac'] },
+  // Pink hue sits around 340-355° (negative b*), not the 5-10° range —
+  // at b=+5 we drift into salmon/coral and start matching warm skin
+  // tones in dimly lit scenes. Classic "pink" needs a cool shift.
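+  // For intuition (approximate values, using labHueDeg above):
+  // labHueDeg(50, -8) ≈ 351° sits inside that window, while
+  // labHueDeg(50, 5) ≈ 6° already drifts toward coral/salmon.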
+  { family: 'pink', lab: { l: 70, a: 50, b: -8 }, synonyms: ['pink', 'rose', 'fuchsia'] },
+  { family: 'coral', lab: { l: 68, a: 45, b: 25 }, synonyms: ['coral', 'salmon'] },
+  { family: 'brown', lab: { l: 40, a: 15, b: 35 }, synonyms: ['brown', 'tan', 'beige', 'chocolate', 'khaki', 'sepia'] },
+  { family: 'white', lab: { l: 95, a: 0, b: 0 }, synonyms: ['white', 'ivory', 'cream', 'snow', 'pearl'] },
+  { family: 'black', lab: { l: 10, a: 0, b: 0 }, synonyms: ['black', 'ebony', 'charcoal', 'midnight', 'onyx'] },
+  { family: 'gray', lab: { l: 55, a: 0, b: 0 }, synonyms: ['gray', 'grey', 'silver', 'slate', 'ash'] },
+];
+
+const SYNONYM_TO_FAMILY = new Map<string, ColorFamilyDefinition>();
+for (const def of COLOR_FAMILIES) {
+  for (const synonym of def.synonyms) SYNONYM_TO_FAMILY.set(synonym, def);
+}
+
+const COLOR_INTENT_TOKENS = new Set([
+  'color',
+  'colors',
+  'palette',
+  'palettes',
+  'tint',
+  'tints',
+  'tone',
+  'tones',
+  'hue',
+  'hues',
+  'grade',
+  'graded',
+  'grading',
+  'dominant',
+  'swatch',
+  'swatches',
+]);
+
+const COLOR_QUERY_FILLER_TOKENS = new Set([
+  'a',
+  'an',
+  'the',
+  'and',
+  'or',
+  'of',
+  'with',
+  'in',
+  'on',
+  'at',
+  'to',
+  'for',
+  'from',
+  'by',
+  'show',
+  'find',
+  'me',
+  'shot',
+  'shots',
+  'scene',
+  'scenes',
+  'clip',
+  'clips',
+  'frame',
+  'frames',
+  'image',
+  'images',
+  'video',
+  'videos',
+  'please',
+]);
+
+function tokenize(text: string): string[] {
+  return text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ').split(/\s+/).filter(Boolean);
+}
+
+/**
+ * Parse whether the query is explicitly asking for palette matching.
+ * Bare color words stay in the normal semantic lane so queries like
+ * "yellow jacket" or "orange sunset" don't get treated as palette-only.
+ */
+export function parseColorQuery(query: string): ParsedColorQuery {
+  const tokens = tokenize(query);
+  const explicitIntent = tokens.some((token) => COLOR_INTENT_TOKENS.has(token));
+  const seen = new Set<string>();
+  const colors: ColorFamilyDefinition[] = [];
+  for (const token of tokens) {
+    const def = SYNONYM_TO_FAMILY.get(token);
+    if (def && !seen.has(def.family)) {
+      seen.add(def.family);
+      colors.push(def);
+    }
+  }
+
+  // A query composed only of color words (e.g. "pink", "red blue") has no
+  // object semantics to chase — treat it as palette intent so CLIP's
+  // weakness on bare color tokens doesn't surface unrelated scenes above
+  // palette-matching ones. Multi-word queries like "pink jacket" still
+  // flow through the normal semantic path with an additive color boost.
+  const allTokensAreColors = tokens.length > 0
+    && tokens.every((token) => SYNONYM_TO_FAMILY.has(token));
+  if (allTokensAreColors) {
+    return { colors, paletteOnly: true };
+  }
+
+  if (!explicitIntent || colors.length === 0) {
+    return { colors: [], paletteOnly: false };
+  }
+
+  const paletteOnly = !tokens.some((token) => (
+    !COLOR_INTENT_TOKENS.has(token)
+    && !COLOR_QUERY_FILLER_TOKENS.has(token)
+    && !SYNONYM_TO_FAMILY.has(token)
+  ));
+
+  return { colors, paletteOnly };
+}
+
+/**
+ * Return the color families (with Lab coordinates) that the query
+ * explicitly asks to match by palette. Empty array means no color-aware
+ * ranking for this query.
+ */
+export function extractQueryColors(query: string): ColorFamilyDefinition[] {
+  return parseColorQuery(query).colors;
+}
+
+/**
+ * Find the best palette match for each query color, pick the overall
+ * closest one, and return the boost + metadata. `null` means no
+ * meaningful match (palette empty, or all ∆E ≥ 30).
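+ *
+ * A minimal usage sketch (palette values are illustrative):
+ *
+ * ```ts
+ * const colors = extractQueryColors('red color');
+ * const hit = colorBoostFor(colors, [{ l: 53, a: 70, b: 50, weight: 0.5 }]);
+ * // hit?.family === 'red'; ∆E ~0 and weight ≥ 0.2 → boost ~MAX_BOOST (0.18).
+ * // At ∆E 10 the boost falls linearly to 0.18 × (30 − 10) / 30 = 0.12.
+ * ```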
+ */ +export function colorBoostFor( + queryColors: ColorFamilyDefinition[], + palette: PaletteEntry[] | undefined, +): ColorBoostResult | null { + if (queryColors.length === 0 || !palette || palette.length === 0) return null; + + let best: ColorBoostResult | null = null; + for (const query of queryColors) { + for (const entry of palette) { + if (!paletteEntryCompatibleWithFamily(query, entry)) continue; + const distance = deltaE2000(query.lab, { l: entry.l, a: entry.a, b: entry.b }); + const boost = boostFromDeltaE(distance, entry.weight); + if (boost <= 0) continue; + if (!best || boost > best.boost) { + best = { boost, family: query.family, deltaE: distance, matched: entry }; + } + } + } + return best; +} + +/** + * Map a single Lab swatch to its closest color-family name. Used by the + * "click a palette swatch to search its color" interaction — we want the + * family word (e.g. `"red"`) that parseColorQuery will canonicalize back + * to the same palette-only search. + * + * Returns null for swatches too far from every family reference (rare — + * the families span the Lab gamut densely enough that the nearest is + * usually within ∆E 30). + */ +export function nearestColorFamily(entry: LabColor): string | null { + let bestFamily: string | null = null; + let bestDistance = Number.POSITIVE_INFINITY; + for (const def of COLOR_FAMILIES) { + const distance = deltaE2000(def.lab, entry); + if (distance < bestDistance) { + bestDistance = distance; + bestFamily = def.family; + } + } + return bestDistance < ZERO_BOOST_DELTA_E ? bestFamily : null; +} + +/** + * Symmetric weighted distance between two palettes via greedy nearest + * matching in both directions. Full Hungarian assignment would be cleaner + * but palettes are ≤6 entries in practice, and greedy matches stay within + * a few percent of optimal for that size. Averaging both directions keeps + * the metric symmetric — a tiny palette shouldn't "win" just because one + * of its entries happens to be close to a big entry in the reference. + * + * Output is a weighted-mean ∆E 2000 across matched entries. Palettes with + * no overlap return POSITIVE_INFINITY so the ranker can drop them. + */ +export function palettePairDistance( + a: PaletteEntry[], + b: PaletteEntry[], +): number { + if (a.length === 0 || b.length === 0) return Number.POSITIVE_INFINITY; + const forward = greedyDirectionalDistance(a, b); + const reverse = greedyDirectionalDistance(b, a); + if (!Number.isFinite(forward) || !Number.isFinite(reverse)) { + return Number.POSITIVE_INFINITY; + } + return (forward + reverse) / 2; +} + +function greedyDirectionalDistance( + source: PaletteEntry[], + target: PaletteEntry[], +): number { + let totalWeight = 0; + let totalWeighted = 0; + for (const s of source) { + let best = Number.POSITIVE_INFINITY; + for (const t of target) { + const d = deltaE2000({ l: s.l, a: s.a, b: s.b }, { l: t.l, a: t.a, b: t.b }); + if (d < best) best = d; + } + if (!Number.isFinite(best)) continue; + totalWeighted += best * s.weight; + totalWeight += s.weight; + } + if (totalWeight <= 0) return Number.POSITIVE_INFINITY; + return totalWeighted / totalWeight; +} + +export interface PaletteSimilarityResult { + /** Cosine-compatible boost (higher = more similar). */ + boost: number; + /** Weighted-mean ∆E 2000 between the two palettes. */ + distance: number; +} + +/** + * Turn a palette-pair distance into a score in cosine-compatible units, + * reusing the same linear falloff as the single-color boost so both + * signals compose cleanly when mixed. 
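+ *
+ * E.g. (worked numbers): a weighted-mean ∆E of 7.5 yields a boost of
+ * 0.18 × (30 − 7.5) / 30 = 0.135; any distance ≥ 30 returns null.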
+ */
+export function paletteSimilarityBoost(
+  reference: PaletteEntry[] | undefined,
+  candidate: PaletteEntry[] | undefined,
+): PaletteSimilarityResult | null {
+  if (!reference || !candidate) return null;
+  const distance = palettePairDistance(reference, candidate);
+  if (!Number.isFinite(distance) || distance >= ZERO_BOOST_DELTA_E) return null;
+  const linear = (ZERO_BOOST_DELTA_E - distance) / ZERO_BOOST_DELTA_E;
+  return { boost: MAX_BOOST * linear, distance };
+}
diff --git a/src/features/scene-browser/utils/embeddings-cache.ts b/src/features/scene-browser/utils/embeddings-cache.ts
new file mode 100644
index 000000000..0eac9b47d
--- /dev/null
+++ b/src/features/scene-browser/utils/embeddings-cache.ts
@@ -0,0 +1,344 @@
+/**
+ * In-memory embeddings cache for the Scene Browser.
+ *
+ * Caption embeddings live on disk as a packed `Float32Array` bin plus
+ * metadata in `captions.json`. The hook layer wants fast synchronous
+ * access during ranking, so this module hydrates per-media vectors on
+ * first request and keeps them in memory for the session.
+ *
+ * Cache keys are "scene ids" (`${mediaId}:${captionIndex}`) so the ranker
+ * doesn't need to know about media boundaries.
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import {
+  EMBEDDING_MODEL_DIM,
+  EMBEDDING_MODEL_ID,
+  CLIP_EMBEDDING_DIM,
+  CLIP_MODEL_ID,
+  buildEmbeddingText,
+  clipProvider,
+  embeddingsProvider,
+  extractDominantColors,
+  type PaletteEntry,
+} from '../deps/analysis';
+import {
+  mediaLibraryService,
+  useMediaLibraryStore,
+  type MediaMetadata,
+} from '../deps/media-library';
+import {
+  getCaptionEmbeddings,
+  getCaptionImageEmbeddings,
+  getCaptionThumbnailBlob,
+  getCaptionsEmbeddingsMeta,
+  getTranscript,
+  saveCaptionEmbeddings,
+  saveCaptionImageEmbeddings,
+} from '../deps/storage';
+
+const log = createLogger('SceneBrowser:EmbeddingsCache');
+
+/** sceneId → normalized text embedding vector. */
+const embeddings = new Map<string, Float32Array>();
+/** sceneId → normalized CLIP image embedding vector. */
+const imageEmbeddings = new Map<string, Float32Array>();
+/** sceneId → dominant-color palette entries (Lab + weight). */
+const palettes = new Map<string, PaletteEntry[]>();
+/** mediaId → outstanding hydration promise so concurrent callers share work. */
+const pendingHydrates = new Map<string, Promise<void>>();
+/** mediaId → outstanding text indexing (retroactive generate) promise. */
+const pendingIndexes = new Map<string, Promise<void>>();
+/** mediaId → outstanding image indexing (retroactive generate) promise. */
+const pendingImageIndexes = new Map<string, Promise<void>>();
+/** mediaIds we've already concluded have no usable text embeddings. */
+const missingEmbeddings = new Set<string>();
+/** mediaIds we've already concluded have no usable image embeddings. */
+const missingImageEmbeddings = new Set<string>();
+
+function sceneId(mediaId: string, captionIndex: number): string {
+  return `${mediaId}:${captionIndex}`;
+}
+
+function populateFromInMemory(media: MediaMetadata): boolean {
+  const captions = media.aiCaptions;
+  if (!captions || captions.length === 0) return false;
+  let found = false;
+  captions.forEach((caption, i) => {
+    if (Array.isArray(caption.embedding) && caption.embedding.length === EMBEDDING_MODEL_DIM) {
+      embeddings.set(sceneId(media.id, i), Float32Array.from(caption.embedding));
+      found = true;
+    }
+    // Palettes are tiny — always mirror from whatever the store has so
+    // the rank-time Map is a read-only snapshot of the source of truth.
+    if (Array.isArray(caption.palette) && caption.palette.length > 0) {
+      palettes.set(sceneId(media.id, i), caption.palette.map((entry) => ({
+        l: entry.l, a: entry.a, b: entry.b, weight: entry.weight,
+      })));
+    }
+  });
+  return found;
+}
+
+async function hydrateFromDisk(mediaId: string, expectedCount: number): Promise<{
+  text: boolean;
+  image: boolean;
+}> {
+  const meta = await getCaptionsEmbeddingsMeta(mediaId);
+  if (!meta) return { text: false, image: false };
+
+  let textOk = false;
+  if (meta.embeddingModel === EMBEDDING_MODEL_ID && meta.embeddingDim === EMBEDDING_MODEL_DIM) {
+    const vectors = await getCaptionEmbeddings(mediaId, meta.embeddingDim, expectedCount);
+    if (vectors) {
+      vectors.forEach((vector, i) => embeddings.set(sceneId(mediaId, i), vector));
+      textOk = true;
+    }
+  }
+
+  let imageOk = false;
+  if (
+    meta.imageEmbeddingModel === CLIP_MODEL_ID
+    && meta.imageEmbeddingDim === CLIP_EMBEDDING_DIM
+  ) {
+    const vectors = await getCaptionImageEmbeddings(mediaId, meta.imageEmbeddingDim, expectedCount);
+    if (vectors) {
+      vectors.forEach((vector, i) => imageEmbeddings.set(sceneId(mediaId, i), vector));
+      imageOk = true;
+    }
+  }
+
+  return { text: textOk, image: imageOk };
+}
+
+/**
+ * Ensure embeddings for every caption on `mediaId` are present in memory.
+ * Reuses already-loaded vectors; concurrent callers share a single disk read.
+ */
+export function ensureEmbeddingsLoaded(mediaId: string): Promise<void> {
+  const existing = pendingHydrates.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const media = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+
+    let textHydrated = populateFromInMemory(media);
+    let imageHydrated = imageEmbeddings.has(sceneId(mediaId, 0));
+
+    if (!textHydrated || !imageHydrated) {
+      const loaded = await hydrateFromDisk(mediaId, media.aiCaptions.length);
+      textHydrated ||= loaded.text;
+      imageHydrated ||= loaded.image;
+    }
+
+    if (!textHydrated) missingEmbeddings.add(mediaId);
+    if (!imageHydrated) missingImageEmbeddings.add(mediaId);
+  })().catch((error) => {
+    log.warn('Embedding hydrate failed', { mediaId, error });
+    missingEmbeddings.add(mediaId);
+    missingImageEmbeddings.add(mediaId);
+  }).finally(() => {
+    pendingHydrates.delete(mediaId);
+  });
+
+  pendingHydrates.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Run the embedding model over captions that have never been indexed,
+ * save the resulting `.bin`, patch captions.json with the model metadata,
+ * and populate the cache. Awaits `embeddingsProvider.ensureReady()` before
+ * doing any work, so the caller decides when to pay the model-download
+ * cost by choosing when to kick this off.
+ */
+export function indexMediaCaptions(mediaId: string): Promise<void> {
+  const existing = pendingIndexes.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const state = useMediaLibraryStore.getState();
+    const media = state.mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+    if (state.taggingMediaIds.has(mediaId)) return;
+
+    await embeddingsProvider.ensureReady();
+    // The main Analyze-with-AI pipeline owns this media during its run.
+    // Re-check after the (potentially long) model download to avoid racing
+    // it with a re-analysis that just started.
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) return;
+
+    // Gather the same context signals the main pipeline uses so a
+    // retroactively-indexed caption is embedded identically to one
+    // generated by Analyze-with-AI — otherwise semantic ranking would
+    // get two flavors of vectors in one library and drift in quality.
+    const transcript = await getTranscript(mediaId).catch(() => null);
+    const colorResults = await Promise.all(
+      media.aiCaptions.map(async (caption) => {
+        if (!caption.thumbRelPath) return { phrase: '', palette: [] as const };
+        try {
+          const blob = await getCaptionThumbnailBlob(caption.thumbRelPath);
+          if (!blob) return { phrase: '', palette: [] as const };
+          return await extractDominantColors(blob);
+        } catch {
+          return { phrase: '', palette: [] as const };
+        }
+      }),
+    );
+
+    const texts = media.aiCaptions.map((caption, i) => buildEmbeddingText({
+      caption: { text: caption.text, timeSec: caption.timeSec },
+      sceneData: caption.sceneData,
+      transcriptSegments: transcript?.segments,
+      colorPhrase: colorResults[i]?.phrase ?? '',
+    }));
+
+    const vectors = await embeddingsProvider.embedBatch(texts);
+    if (vectors.length !== texts.length) {
+      throw new Error(`Embedding returned ${vectors.length} vectors for ${texts.length} captions`);
+    }
+
+    await saveCaptionEmbeddings(mediaId, vectors, EMBEDDING_MODEL_DIM);
+    // Persist the model metadata on captions.json so future sessions know
+    // the bin matches. We rewrite the full captions payload — cheap, since
+    // retroactive indexing is an explicit user action, not a hot path.
+    // Stamp the extracted palettes onto each caption so retroactive
+    // indexing also populates color data for legacy captions without it.
+    const capturedCaptions = media.aiCaptions.map((caption, i) => {
+      const palette = colorResults[i]?.palette;
+      const next = { ...caption };
+      if (palette && palette.length > 0) next.palette = [...palette];
+      return next;
+    });
+    await mediaLibraryService.updateMediaCaptions(mediaId, capturedCaptions, {
+      embeddingModel: EMBEDDING_MODEL_ID,
+      embeddingDim: EMBEDDING_MODEL_DIM,
+    });
+    useMediaLibraryStore.getState().updateMediaCaptions(mediaId, capturedCaptions);
+
+    vectors.forEach((vector, i) => {
+      embeddings.set(sceneId(mediaId, i), vector);
+    });
+    missingEmbeddings.delete(mediaId);
+  })().finally(() => {
+    pendingIndexes.delete(mediaId);
+  });
+
+  pendingIndexes.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Generate CLIP image embeddings for every thumbnail-bearing caption on
+ * `mediaId`, persist the bin, update captions.json with the image model
+ * metadata, and populate the cache. The persisted bin layout must match
+ * caption indexes 1:1, so every thumbnail is required to exist on disk
+ * up front — if any is missing we bail rather than emit a short-count bin.
+ */
+export function indexMediaImageCaptions(mediaId: string): Promise<void> {
+  const existing = pendingImageIndexes.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const state = useMediaLibraryStore.getState();
+    const media = state.mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+    if (state.taggingMediaIds.has(mediaId)) return;
+
+    // Load every thumbnail up front — CLIP expects one vector per
+    // caption index, so a missing thumb anywhere in the series means we
+    // can't write a coherent bin for this media. Lazy-thumb will
+    // eventually generate them on next Scene Browser visit; skip and
+    // retry next time.
+    const blobs: Blob[] = [];
+    for (const caption of media.aiCaptions) {
+      if (!caption.thumbRelPath) return;
+      const blob = await getCaptionThumbnailBlob(caption.thumbRelPath);
+      if (!blob) return;
+      blobs.push(blob);
+    }
+
+    await clipProvider.ensureReady();
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) return;
+
+    const vectors = await clipProvider.embedImages(blobs);
+    if (vectors.length !== blobs.length) {
+      throw new Error(`CLIP returned ${vectors.length} vectors for ${blobs.length} thumbnails`);
+    }
+
+    await saveCaptionImageEmbeddings(mediaId, vectors, CLIP_EMBEDDING_DIM);
+
+    // Patch captions.json with the image-model metadata. Fetch the latest
+    // captions from the store so we preserve concurrent edits (rare, but
+    // re-analyze-and-index-at-the-same-time is the exact race we care about).
+    const latest = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (latest?.aiCaptions) {
+      await mediaLibraryService.updateMediaCaptions(mediaId, latest.aiCaptions, {
+        embeddingModel: EMBEDDING_MODEL_ID,
+        embeddingDim: EMBEDDING_MODEL_DIM,
+        imageEmbeddingModel: CLIP_MODEL_ID,
+        imageEmbeddingDim: CLIP_EMBEDDING_DIM,
+      });
+    }
+
+    vectors.forEach((vector, i) => {
+      imageEmbeddings.set(sceneId(mediaId, i), vector);
+    });
+    missingImageEmbeddings.delete(mediaId);
+  })().finally(() => {
+    pendingImageIndexes.delete(mediaId);
+  });
+
+  pendingImageIndexes.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Drop cached embeddings for `mediaId`. Call after Analyze-with-AI finishes
+ * (new embeddings will be hydrated from the fresh in-memory caption array
+ * on next access) or when embeddings-on-disk go out of sync.
+ */
+export function invalidateEmbeddingsCache(mediaId: string): void {
+  const prefix = `${mediaId}:`;
+  for (const key of embeddings.keys()) {
+    if (key.startsWith(prefix)) embeddings.delete(key);
+  }
+  for (const key of imageEmbeddings.keys()) {
+    if (key.startsWith(prefix)) imageEmbeddings.delete(key);
+  }
+  for (const key of palettes.keys()) {
+    if (key.startsWith(prefix)) palettes.delete(key);
+  }
+  missingEmbeddings.delete(mediaId);
+  missingImageEmbeddings.delete(mediaId);
+  pendingHydrates.delete(mediaId);
+  pendingIndexes.delete(mediaId);
+  pendingImageIndexes.delete(mediaId);
+}
+
+/** Read-only view of the in-memory text embeddings cache, for ranking. */
+export function getEmbeddingsSnapshot(): Map<string, Float32Array> {
+  return embeddings;
+}
+
+/** Read-only view of the in-memory CLIP image embeddings cache. */
+export function getImageEmbeddingsSnapshot(): Map<string, Float32Array> {
+  return imageEmbeddings;
+}
+
+/** Read-only view of the in-memory color palette cache. */
+export function getPalettesSnapshot(): Map<string, PaletteEntry[]> {
+  return palettes;
+}
+
+/** Whether the given media is known to be missing text embeddings. */
+export function isMediaMissingEmbeddings(mediaId: string): boolean {
+  return missingEmbeddings.has(mediaId);
+}
+
+/** Whether the given media is known to be missing image embeddings. */
+export function isMediaMissingImageEmbeddings(mediaId: string): boolean {
+  return missingImageEmbeddings.has(mediaId);
+}
diff --git a/src/features/scene-browser/utils/invalidate.ts b/src/features/scene-browser/utils/invalidate.ts
new file mode 100644
index 000000000..389935ead
--- /dev/null
+++ b/src/features/scene-browser/utils/invalidate.ts
@@ -0,0 +1,24 @@
+/**
+ * Single entry point for "this media's captions are about to change —
+ * drop every cached thumbnail resource tied to it." Called by
+ * Analyze-with-AI (and any future re-caption flow) before the pipeline
+ * deletes old thumbs and writes new ones.
+ *
+ * Combines the blob URL cache (the hook that hands JPEG URLs to
+ * rows) and the lazy-thumb probe/generation cache (the queue that fills
+ * in pointers for pre-feature captions) in one call so callers don't
+ * have to know about the internal split.
+ */
+
+import { invalidateMediaCaptionThumbBlobs } from '../hooks/use-caption-thumbnail';
+import { invalidateEmbeddingsCache } from './embeddings-cache';
+import { invalidateLazyThumbCache } from './lazy-thumb';
+
+export function invalidateMediaCaptionThumbnails(mediaId: string): void {
+  invalidateMediaCaptionThumbBlobs(mediaId);
+  invalidateLazyThumbCache(mediaId);
+  // Semantic embeddings are tied 1:1 to caption indexes — a re-analyze
+  // throws away the old caption array and generates a fresh one, so the
+  // cached vectors no longer correspond to their (new) scenes.
+  invalidateEmbeddingsCache(mediaId);
+}
diff --git a/src/features/scene-browser/utils/lazy-thumb.ts b/src/features/scene-browser/utils/lazy-thumb.ts
new file mode 100644
index 000000000..7281a771b
--- /dev/null
+++ b/src/features/scene-browser/utils/lazy-thumb.ts
@@ -0,0 +1,280 @@
+/**
+ * Lazy thumbnail generator for captions that were created before the
+ * Scene Browser feature landed (`thumbRelPath` missing). Opens the source
+ * media, seeks to the caption timestamp, captures a JPEG, and persists it
+ * alongside the rest of that media's caption thumbs so the Scene Browser
+ * can pick it up on subsequent reads.
+ *
+ * Work is queued globally so we never spin up more than one HTMLVideoElement
+ * at a time — 161-caption libraries can otherwise exhaust memory on long
+ * clips. Images are handled via `fetch` + `createImageBitmap` (same as the
+ * LFM provider's image path).
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import { mediaLibraryService, useMediaLibraryStore, type MediaMetadata } from '../deps/media-library';
+import { probeCaptionThumbnail, saveCaptionThumbnail } from '../deps/storage';
+
+const log = createLogger('SceneBrowser:LazyThumb');
+
+const PERSIST_DEBOUNCE_MS = 1500;
+const pendingPersists = new Map<string, ReturnType<typeof setTimeout>>();
+
+/**
+ * Rewrite `captions.json` + the metadata mirror for `mediaId` with the
+ * current in-memory captions array. Coalesces rapid fire-and-forget
+ * updates from a stream of thumbnail writes into a single disk write per
+ * ~{@link PERSIST_DEBOUNCE_MS}ms window — 161 captions that each land a
+ * thumb in quick succession otherwise trigger 161 JSON rewrites.
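+ *
+ * E.g. ten thumbs landing 50 ms apart produce a single write ~1.5 s
+ * after the last one — each call resets the shared per-media timer.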
+ */
+function schedulePersist(mediaId: string): void {
+  const existing = pendingPersists.get(mediaId);
+  if (existing) clearTimeout(existing);
+  const timer = setTimeout(() => {
+    pendingPersists.delete(mediaId);
+    const latest = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (!latest?.aiCaptions) return;
+    void mediaLibraryService
+      .updateMediaCaptions(mediaId, latest.aiCaptions)
+      .catch((error) => {
+        log.warn('Persisting caption thumb pointers failed', { mediaId, error });
+      });
+  }, PERSIST_DEBOUNCE_MS);
+  pendingPersists.set(mediaId, timer);
+}
+
+const MAX_DIM = 512;
+const SEEK_TIMEOUT_MS = 8_000;
+
+interface PendingRequest {
+  mediaId: string;
+  captionIndex: number;
+  timeSec: number;
+  resolve: (relPath: string | null) => void;
+}
+
+const queue: PendingRequest[] = [];
+let running = false;
+const resultCache = new Map<string, string | null>();
+const inflight = new Map<string, Promise<string | null>>();
+
+function cacheKey(mediaId: string, captionIndex: number): string {
+  return `${mediaId}:${captionIndex}`;
+}
+
+/**
+ * Drop the memoized probe + generation results for every caption of
+ * `mediaId` so a re-analyzed media starts from a clean slate. Queued
+ * requests that haven't started yet are dropped; in-flight generations
+ * are left to finish and are discarded at the write site via the
+ * `taggingMediaIds` gate below.
+ */
+export function invalidateLazyThumbCache(mediaId: string): void {
+  const prefix = `${mediaId}:`;
+  for (const key of resultCache.keys()) {
+    if (key.startsWith(prefix)) resultCache.delete(key);
+  }
+  for (let i = queue.length - 1; i >= 0; i -= 1) {
+    const request = queue[i]!;
+    if (request.mediaId === mediaId) {
+      request.resolve(null);
+      queue.splice(i, 1);
+    }
+  }
+  const pendingPersist = pendingPersists.get(mediaId);
+  if (pendingPersist) {
+    clearTimeout(pendingPersist);
+    pendingPersists.delete(mediaId);
+  }
+}
+
+async function seekVideoTo(video: HTMLVideoElement, timeSec: number): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const timeout = setTimeout(() => {
+      cleanup();
+      reject(new Error(`Seek timed out at ${timeSec}s`));
+    }, SEEK_TIMEOUT_MS);
+    const onSeeked = () => {
+      cleanup();
+      resolve();
+    };
+    const onError = () => {
+      cleanup();
+      reject(new Error('Video seek failed'));
+    };
+    const cleanup = () => {
+      clearTimeout(timeout);
+      video.removeEventListener('seeked', onSeeked);
+      video.removeEventListener('error', onError);
+    };
+    video.addEventListener('seeked', onSeeked, { once: true });
+    video.addEventListener('error', onError, { once: true });
+    video.currentTime = Math.max(0, timeSec);
+  });
+}
+
+async function captureFrame(video: HTMLVideoElement): Promise<Blob> {
+  const vw = video.videoWidth || 640;
+  const vh = video.videoHeight || 360;
+  const scale = Math.min(MAX_DIM / Math.max(vw, vh), 1);
+  const width = Math.max(1, Math.round(vw * scale));
+  const height = Math.max(1, Math.round(vh * scale));
+  const canvas = new OffscreenCanvas(width, height);
+  const context = canvas.getContext('2d');
+  if (!context) throw new Error('OffscreenCanvas 2d context unavailable');
+  context.drawImage(video, 0, 0, width, height);
+  return canvas.convertToBlob({ type: 'image/jpeg', quality: 0.75 });
+}
+
+async function captureImage(blob: Blob): Promise<Blob> {
+  const bitmap = await createImageBitmap(blob);
+  try {
+    const scale = Math.min(MAX_DIM / Math.max(bitmap.width, bitmap.height), 1);
+    const width = Math.max(1, Math.round(bitmap.width * scale));
+    const height = Math.max(1, Math.round(bitmap.height * scale));
+    const canvas = new OffscreenCanvas(width, height);
+    const context = canvas.getContext('2d');
+    if (!context) throw new Error('OffscreenCanvas 2d context unavailable');
+    context.drawImage(bitmap, 0, 0, width, height);
+    return canvas.convertToBlob({ type: 'image/jpeg', quality: 0.75 });
+  } finally {
+    bitmap.close();
+  }
+}
+
+/**
+ * Patch the in-memory media item so subsequent renders see the new path,
+ * then schedule a debounced write-back so the pointer survives reloads.
+ */
+function patchStoreThumbPath(mediaId: string, captionIndex: number, relPath: string): void {
+  const store = useMediaLibraryStore.getState();
+  const media = store.mediaById[mediaId];
+  if (!media || !media.aiCaptions) return;
+  const existing = media.aiCaptions[captionIndex];
+  if (!existing || existing.thumbRelPath === relPath) return;
+  const updated: NonNullable<MediaMetadata['aiCaptions']> = media.aiCaptions.map((caption, i) =>
+    i === captionIndex ? { ...caption, thumbRelPath: relPath } : caption,
+  );
+  store.updateMediaCaptions(mediaId, updated);
+  schedulePersist(mediaId);
+}
+
+async function generateOne(request: PendingRequest): Promise<string | null> {
+  const { mediaId, captionIndex, timeSec } = request;
+  const state = useMediaLibraryStore.getState();
+  const media = state.mediaById[mediaId];
+  if (!media) return null;
+  // A concurrent Analyze-with-AI run owns this media's thumbs for the
+  // duration of its sweep — skip lazy work so we don't race the main
+  // pipeline and clobber a fresh thumbnail with a stale one.
+  if (state.taggingMediaIds.has(mediaId)) return null;
+
+  const isImage = media.mimeType.startsWith('image/');
+  const blobUrl = await mediaLibraryService.getMediaBlobUrl(mediaId);
+  if (!blobUrl) return null;
+
+  try {
+    let jpeg: Blob;
+    if (isImage) {
+      const response = await fetch(blobUrl);
+      const sourceBlob = await response.blob();
+      jpeg = await captureImage(sourceBlob);
+    } else {
+      const video = document.createElement('video');
+      video.muted = true;
+      video.preload = 'auto';
+      video.crossOrigin = 'anonymous';
+      video.src = blobUrl;
+      try {
+        await new Promise<void>((resolve, reject) => {
+          const onLoad = () => { cleanup(); resolve(); };
+          const onError = () => { cleanup(); reject(new Error('Video load failed')); };
+          const cleanup = () => {
+            video.removeEventListener('loadedmetadata', onLoad);
+            video.removeEventListener('error', onError);
+          };
+          video.addEventListener('loadedmetadata', onLoad, { once: true });
+          video.addEventListener('error', onError, { once: true });
+        });
+        await seekVideoTo(video, timeSec);
+        jpeg = await captureFrame(video);
+      } finally {
+        video.pause();
+        video.removeAttribute('src');
+        video.load();
+      }
+    }
+
+    // Re-check the tagging gate before writing — Analyze-with-AI may have
+    // started between our initial check and the slow seek + capture above.
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) {
+      return null;
+    }
+    const relPath = await saveCaptionThumbnail(mediaId, captionIndex, jpeg);
+    patchStoreThumbPath(mediaId, captionIndex, relPath);
+    return relPath;
+  } catch (error) {
+    log.warn('Lazy thumbnail generation failed', { mediaId, captionIndex, timeSec, error });
+    return null;
+  } finally {
+    URL.revokeObjectURL(blobUrl);
+  }
+}
+
+async function drain(): Promise<void> {
+  if (running) return;
+  running = true;
+  try {
+    while (queue.length > 0) {
+      const request = queue.shift()!;
+      const key = cacheKey(request.mediaId, request.captionIndex);
+      const relPath = await generateOne(request);
+      resultCache.set(key, relPath);
+      request.resolve(relPath);
+    }
+  } finally {
+    running = false;
+  }
+}
+
+/**
+ * Request a thumbnail for a caption that has no persisted `thumbRelPath`.
+ * Returns the rel path of either the freshly-saved or already-on-disk thumb,
+ * or `null` when generation fails. Concurrent callers for the same
+ * (mediaId, captionIndex) share one job.
+ *
+ * The disk probe runs outside the generation queue so all scenes can probe
+ * in parallel on reload — only probe misses pay the price of the serial
+ * video-seek generation pipeline.
+ */
+export function requestLazyCaptionThumbnail(
+  mediaId: string,
+  captionIndex: number,
+  timeSec: number,
+): Promise<string | null> {
+  const key = cacheKey(mediaId, captionIndex);
+  const cached = resultCache.get(key);
+  if (cached !== undefined) return Promise.resolve(cached);
+
+  const pending = inflight.get(key);
+  if (pending) return pending;
+
+  const promise = (async () => {
+    const existing = await probeCaptionThumbnail(mediaId, captionIndex);
+    if (existing) {
+      patchStoreThumbPath(mediaId, captionIndex, existing);
+      resultCache.set(key, existing);
+      return existing;
+    }
+    const generated = await new Promise<string | null>((resolve) => {
+      queue.push({ mediaId, captionIndex, timeSec, resolve });
+      void drain();
+    });
+    resultCache.set(key, generated);
+    return generated;
+  })().finally(() => {
+    inflight.delete(key);
+  });
+  inflight.set(key, promise);
+  return promise;
+}
diff --git a/src/features/scene-browser/utils/library-palette.test.ts b/src/features/scene-browser/utils/library-palette.test.ts
new file mode 100644
index 000000000..e42ac30dd
--- /dev/null
+++ b/src/features/scene-browser/utils/library-palette.test.ts
@@ -0,0 +1,85 @@
+import { describe, expect, it } from 'vitest';
+import type { PaletteEntry } from '../deps/analysis';
+import { clusterPaletteEntries, flattenLibraryPalettes } from './library-palette';
+
+describe('flattenLibraryPalettes', () => {
+  it('normalizes each palette so long clips do not dominate', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 10, b: 10, weight: 0.8 }];
+    const b: PaletteEntry[] = [
+      { l: 30, a: 0, b: 0, weight: 0.5 },
+      { l: 60, a: 0, b: 0, weight: 0.5 },
+    ];
+    const flat = flattenLibraryPalettes([a, b]);
+    const totalA = flat.filter((e) => e.l === 50).reduce((s, e) => s + e.weight, 0);
+    const totalB = flat.filter((e) => e.l !== 50).reduce((s, e) => s + e.weight, 0);
+    expect(totalA).toBeCloseTo(1, 5);
+    expect(totalB).toBeCloseTo(1, 5);
+  });
+
+  it('skips empty or undefined palettes', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 10, b: 10, weight: 0.8 }];
+    expect(flattenLibraryPalettes([a, undefined, []])).toHaveLength(1);
+  });
+});
+
+describe('clusterPaletteEntries', () => {
+  it('returns empty for empty input', () => {
+    expect(clusterPaletteEntries([], 5)).toEqual([]);
+  });
+
+  it('caps cluster count at the entry count', () => {
+    const entries: PaletteEntry[] = [
+      { l: 50, a: 60, b: 40, weight: 1 },
+      { l: 40, a: 15, b: -60, weight: 1 },
+    ];
+    const clusters = clusterPaletteEntries(entries, 10);
+    expect(clusters).toHaveLength(2);
+  });
+
+  it('recovers well-separated source colors', () => {
+    // Three obvious color blobs with a bit of jitter per entry. The
+    // clusters should land near each of the three source centers.
+    const makeBlob = (base: { l: number; a: number; b: number }): PaletteEntry[] =>
+      Array.from({ length: 5 }, (_, i) => ({
+        l: base.l + (i - 2) * 0.3,
+        a: base.a + (i - 2) * 0.3,
+        b: base.b + (i - 2) * 0.3,
+        weight: 1,
+      }));
+
+    const entries = [
+      ...makeBlob({ l: 53, a: 70, b: 50 }), // red
+      ...makeBlob({ l: 40, a: 15, b: -60 }), // blue
+      ...makeBlob({ l: 90, a: -5, b: 80 }), // yellow
+    ];
+    const clusters = clusterPaletteEntries(entries, 3);
+    expect(clusters).toHaveLength(3);
+
+    // At least one cluster center should be close to each source blob.
+    const nearest = (target: { l: number; a: number; b: number }): number => {
+      let best = Infinity;
+      for (const c of clusters) {
+        const d = Math.sqrt(
+          (c.l - target.l) ** 2 + (c.a - target.a) ** 2 + (c.b - target.b) ** 2,
+        );
+        if (d < best) best = d;
+      }
+      return best;
+    };
+    expect(nearest({ l: 53, a: 70, b: 50 })).toBeLessThan(5);
+    expect(nearest({ l: 40, a: 15, b: -60 })).toBeLessThan(5);
+    expect(nearest({ l: 90, a: -5, b: 80 })).toBeLessThan(5);
+  });
+
+  it('weights cluster output by pixel coverage', () => {
+    const entries: PaletteEntry[] = [
+      { l: 50, a: 60, b: 40, weight: 0.9 }, // big red
+      { l: 40, a: 15, b: -60, weight: 0.05 }, // tiny blue
+      { l: 40, a: 15, b: -60, weight: 0.05 }, // tiny blue
+    ];
+    const clusters = clusterPaletteEntries(entries, 2);
+    expect(clusters).toHaveLength(2);
+    const sorted = [...clusters].sort((a, b) => b.weight - a.weight);
+    expect(sorted[0]?.weight).toBeGreaterThan(sorted[1]?.weight ?? 0);
+  });
+});
diff --git a/src/features/scene-browser/utils/library-palette.ts b/src/features/scene-browser/utils/library-palette.ts
new file mode 100644
index 000000000..067d06023
--- /dev/null
+++ b/src/features/scene-browser/utils/library-palette.ts
@@ -0,0 +1,190 @@
+/**
+ * Weighted k-means clustering of palette entries across the library.
+ *
+ * Color Mode shows a small grid of "the unique colors this library is
+ * made of" — for that to be usable we need to collapse the hundreds of
+ * per-scene palette entries into ~12 cluster centers in CIELAB space,
+ * weighted by each entry's pixel coverage so a vivid accent color
+ * doesn't get drowned out by huge neutral expanses of sky/wall.
+ *
+ * Init: deterministic k-means++ (heaviest entry, then farthest-weighted).
+ * Iteration: weighted mean in Lab.
+ * Distance: ∆E 2000 so perceptual differences drive cluster membership.
+ */
+
+import { deltaE2000, type LabColor, type PaletteEntry } from '../deps/analysis';
+
+export interface LabCluster extends LabColor {
+  /** Sum of pixel-coverage weights of all entries in the cluster. */
+  weight: number;
+  /** Count of raw palette entries that landed here. */
+  count: number;
+}
+
+/**
+ * Fold all media palettes into a single flat list, scaled so every
+ * scene contributes equally. Without the per-palette normalization a
+ * long clip would dominate just by having more caption frames indexed.
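+ *
+ * E.g. a two-entry palette with weights [0.6, 0.2] flattens to
+ * [0.75, 0.25]: each source palette sums to 1 before clustering.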
+ */
+export function flattenLibraryPalettes(
+  palettesBySource: Iterable<PaletteEntry[] | undefined>,
+): PaletteEntry[] {
+  const flat: PaletteEntry[] = [];
+  for (const palette of palettesBySource) {
+    if (!palette || palette.length === 0) continue;
+    const total = palette.reduce((sum, e) => sum + e.weight, 0);
+    if (total <= 0) continue;
+    for (const entry of palette) {
+      flat.push({
+        l: entry.l,
+        a: entry.a,
+        b: entry.b,
+        weight: entry.weight / total,
+      });
+    }
+  }
+  return flat;
+}
+
+/**
+ * Weighted k-means in Lab. Returns at most `k` cluster centers; empty
+ * clusters are dropped rather than re-seeded since for this UI "give me
+ * the N colors that actually exist" is more useful than "exactly N".
+ */
+export function clusterPaletteEntries(
+  entries: PaletteEntry[],
+  k: number,
+  maxIter = 20,
+): LabCluster[] {
+  if (entries.length === 0 || k <= 0) return [];
+  const effectiveK = Math.min(k, entries.length);
+
+  const centers: LabColor[] = seedCentersKMeansPP(entries, effectiveK);
+
+  for (let iter = 0; iter < maxIter; iter += 1) {
+    const assignments = assignEntriesToCenters(entries, centers);
+    const { centers: nextCenters, weights, counts } = recomputeCenters(entries, assignments, centers.length);
+    if (nextCenters.length === 0) break;
+
+    const converged = nextCenters.length === centers.length
+      && nextCenters.every((c, i) => {
+        const prev = centers[i];
+        return prev !== undefined && deltaE2000(c, prev) < 0.5;
+      });
+
+    centers.length = 0;
+    centers.push(...nextCenters);
+
+    if (converged) {
+      return centers.map((c, i) => ({
+        l: c.l, a: c.a, b: c.b,
+        weight: weights[i] ?? 0,
+        count: counts[i] ?? 0,
+      }));
+    }
+  }
+
+  // Final assignment for weights/counts when we exhaust iterations.
+  const assignments = assignEntriesToCenters(entries, centers);
+  const weights = new Array(centers.length).fill(0);
+  const counts = new Array(centers.length).fill(0);
+  for (let i = 0; i < entries.length; i += 1) {
+    const k = assignments[i]!;
+    weights[k] = (weights[k] ?? 0) + entries[i]!.weight;
+    counts[k] = (counts[k] ?? 0) + 1;
+  }
+  return centers.map((c, i) => ({
+    l: c.l, a: c.a, b: c.b,
+    weight: weights[i] ?? 0,
+    count: counts[i] ?? 0,
+  }));
+}
+
+function seedCentersKMeansPP(entries: PaletteEntry[], k: number): LabColor[] {
+  // Deterministic init so the grid doesn't re-order on every render.
+  // Pick the heaviest entry first, then greedily take the entry that
+  // maximizes min-distance-to-existing × weight (D² weighted sampling
+  // argmax instead of random sampling — same asymptotic quality, stable
+  // output).
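+  // Concretely: score(i) = minDist(i)² × weight(i), where minDist is the
+  // ∆E 2000 from entry i to its nearest already-picked center — the
+  // k-means++ D² criterion with the random draw replaced by an argmax.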
+  let heaviest = 0;
+  for (let i = 1; i < entries.length; i += 1) {
+    if (entries[i]!.weight > entries[heaviest]!.weight) heaviest = i;
+  }
+  const seed = entries[heaviest]!;
+  const centers: LabColor[] = [{ l: seed.l, a: seed.a, b: seed.b }];
+
+  while (centers.length < k) {
+    let bestIdx = -1;
+    let bestScore = -1;
+    for (let i = 0; i < entries.length; i += 1) {
+      const entry = entries[i]!;
+      let minD = Number.POSITIVE_INFINITY;
+      for (const c of centers) {
+        const d = deltaE2000(c, { l: entry.l, a: entry.a, b: entry.b });
+        if (d < minD) minD = d;
+      }
+      if (!Number.isFinite(minD)) continue;
+      const score = minD * minD * entry.weight;
+      if (score > bestScore) {
+        bestScore = score;
+        bestIdx = i;
+      }
+    }
+    if (bestIdx < 0 || bestScore <= 0) break;
+    const picked = entries[bestIdx]!;
+    centers.push({ l: picked.l, a: picked.a, b: picked.b });
+  }
+  return centers;
+}
+
+function assignEntriesToCenters(
+  entries: PaletteEntry[],
+  centers: LabColor[],
+): number[] {
+  const out = new Array<number>(entries.length);
+  for (let i = 0; i < entries.length; i += 1) {
+    const entry = entries[i]!;
+    let bestK = 0;
+    let bestD = Number.POSITIVE_INFINITY;
+    for (let k = 0; k < centers.length; k += 1) {
+      const d = deltaE2000(centers[k]!, { l: entry.l, a: entry.a, b: entry.b });
+      if (d < bestD) { bestD = d; bestK = k; }
+    }
+    out[i] = bestK;
+  }
+  return out;
+}
+
+function recomputeCenters(
+  entries: PaletteEntry[],
+  assignments: number[],
+  k: number,
+): { centers: LabColor[]; weights: number[]; counts: number[] } {
+  const sumL = new Array(k).fill(0);
+  const sumA = new Array(k).fill(0);
+  const sumB = new Array(k).fill(0);
+  const sumW = new Array(k).fill(0);
+  const counts = new Array(k).fill(0);
+
+  for (let i = 0; i < entries.length; i += 1) {
+    const entry = entries[i]!;
+    const cluster = assignments[i]!;
+    sumL[cluster] = (sumL[cluster] ?? 0) + entry.l * entry.weight;
+    sumA[cluster] = (sumA[cluster] ?? 0) + entry.a * entry.weight;
+    sumB[cluster] = (sumB[cluster] ?? 0) + entry.b * entry.weight;
+    sumW[cluster] = (sumW[cluster] ?? 0) + entry.weight;
+    counts[cluster] = (counts[cluster] ?? 0) + 1;
+  }
+
+  const centers: LabColor[] = [];
+  const weights: number[] = [];
+  const outCounts: number[] = [];
+  for (let i = 0; i < k; i += 1) {
+    const w = sumW[i] ?? 0;
+    if (w <= 0) continue;
+    centers.push({ l: sumL[i]! / w, a: sumA[i]! / w, b: sumB[i]! / w });
+    weights.push(w);
+    outCounts.push(counts[i] ?? 0);
+  }
+  return { centers, weights, counts: outCounts };
+}
diff --git a/src/features/scene-browser/utils/rank.test.ts b/src/features/scene-browser/utils/rank.test.ts
new file mode 100644
index 000000000..c3cd7fcec
--- /dev/null
+++ b/src/features/scene-browser/utils/rank.test.ts
@@ -0,0 +1,106 @@
+import { describe, expect, it } from 'vitest';
+import { rankScenes, type RankableScene } from './rank';
+
+function scene(id: string, text: string, extra: Partial<RankableScene> = {}): RankableScene {
+  return {
+    id,
+    mediaId: extra.mediaId ?? id.split(':')[0] ?? 'm1',
+    mediaFileName: extra.mediaFileName ?? 'clip.mp4',
+    timeSec: extra.timeSec ??
0, + text, + thumbRelPath: extra.thumbRelPath, + }; +} + +describe('rankScenes', () => { + it('returns scenes unchanged when the query is empty', () => { + const scenes = [scene('a', 'A chef plates pasta'), scene('b', 'Sunset over mountains')]; + const result = rankScenes('', scenes); + expect(result).toHaveLength(2); + expect(result.map((s) => s.id)).toEqual(['a', 'b']); + expect(result[0]!.matchSpans).toEqual([]); + }); + + it('scores exact substring matches above token matches', () => { + const scenes = [ + scene('a', 'A chef plating roasted chicken on a wooden board'), + scene('b', 'Kitchen preparation shot with chef tools'), + ]; + const result = rankScenes('roasted chicken', scenes); + expect(result[0]!.id).toBe('a'); + expect(result[0]!.score).toBeGreaterThan(result[1]?.score ?? 0); + }); + + it('matches on token overlap when no substring is present', () => { + const scenes = [ + scene('a', 'Wide shot of a kitchen counter with copper pots'), + scene('b', 'Living room with bookshelves'), + ]; + const result = rankScenes('kitchen pots', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('tolerates a single-char typo via trigram similarity', () => { + const scenes = [scene('a', 'Bright kitchen counter shot')]; + const result = rankScenes('kitchin', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('does not fuzzy-match on a shared suffix when the prefix differs', () => { + // "orange" vs "range" share four trigrams at the tail end — without a + // prefix gate, the whole mountain library falls out of a fruit query. + const scenes = [ + scene('a', 'A snowy mountain range with a field of green trees in the foreground.'), + scene('b', 'A tree with orange leaves is shown against a blue sky.'), + ]; + const result = rankScenes('orange', scenes); + expect(result.map((s) => s.id)).toEqual(['b']); + }); + + it('filters out scenes below the score threshold', () => { + const scenes = [ + scene('a', 'A chef plating roasted chicken'), + scene('b', 'Sunset over mountains'), + ]; + const result = rankScenes('kitchen', scenes); + expect(result.find((s) => s.id === 'b')).toBeUndefined(); + }); + + it('returns merged case-insensitive match spans', () => { + const scenes = [scene('a', 'Chef places a pan. 
Chef plates pasta.')]; + const result = rankScenes('chef', scenes); + expect(result).toHaveLength(1); + const spans = result[0]!.matchSpans; + expect(spans.length).toBeGreaterThanOrEqual(2); + for (const [from, to] of spans) { + expect(result[0]!.text.slice(from, to).toLowerCase()).toBe('chef'); + } + }); + + it('ignores punctuation differences between query and caption', () => { + const scenes = [scene('a', 'A close-up of a wine glass.')]; + const result = rankScenes('close up wine', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('matches richer scene-caption vocabulary for shot size and weather terms', () => { + const scenes = [ + scene('a', 'Wide shot of a city skyline at dusk.'), + scene('b', 'Medium close-up of a singer on a rainy street.'), + scene('c', 'Close-up of hands slicing limes on a cutting board.'), + ]; + + expect(rankScenes('wide shot dusk skyline', scenes).map((s) => s.id)).toEqual(['a']); + expect(rankScenes('rainy singer', scenes).map((s) => s.id)).toEqual(['b']); + expect(rankScenes('close up limes', scenes)[0]?.id).toBe('c'); + }); + + it('is stable in sort by filename then timestamp when scores tie', () => { + const scenes = [ + scene('b', 'chef pans', { timeSec: 10, mediaFileName: 'b.mp4' }), + scene('a', 'chef pans', { timeSec: 5, mediaFileName: 'a.mp4' }), + ]; + const result = rankScenes('chef pans', scenes); + expect(result.map((s) => s.id)).toEqual(['a', 'b']); + }); +}); diff --git a/src/features/scene-browser/utils/rank.ts b/src/features/scene-browser/utils/rank.ts new file mode 100644 index 000000000..6c439850b --- /dev/null +++ b/src/features/scene-browser/utils/rank.ts @@ -0,0 +1,244 @@ +/** + * Pure keyword + fuzzy ranking for scene captions. + * + * Kept dependency-free so it can be unit-tested without stores or storage, + * and moved into a worker later if ranking grows expensive (current 10k-scene + * runs complete in well under a frame on a modern laptop). + * + * Ranking is intentionally simple in v1: + * - exact substring match on the normalized caption text → 1.0 + * - ratio of query tokens that appear in the caption (whole-word or prefix) + * - trigram similarity as a tiebreak for typo tolerance + * Scores combine with max() rather than a linear blend so a clean substring + * match always beats a partial token overlap, regardless of caption length. + */ + +export interface RankableScene { + /** Stable composite id — typically `${mediaId}:${captionIndex}`. */ + id: string; + mediaId: string; + mediaFileName: string; + timeSec: number; + text: string; + thumbRelPath?: string; + /** + * Dominant-color palette (CIELAB + weight) for UI swatch display and + * color-query ranking. Plumbed through from `MediaCaption.palette`. + */ + palette?: Array<{ l: number; a: number; b: number; weight: number }>; +} + +/** + * Per-signal breakdown of why a scene ranked. Surfaced on the row so + * users can tell, at a glance, whether the match was driven by caption + * keywords, semantic text meaning, or visual (CLIP) similarity — which + * is the main UX gap that "I can't tell if semantic search is working" + * points at. + */ +export interface SceneMatchSignals { + /** Which ranker produced this row. */ + ranker: 'keyword' | 'semantic'; + /** Cosine against the text (all-MiniLM) embedding, when semantic mode ran. */ + textScore?: number; + /** Cosine against the CLIP image embedding, when visual ranking ran. */ + imageScore?: number; + /** True when the row cleared the keyword match threshold. 
*/
+  keywordMatched?: boolean;
+  /**
+   * Color family (e.g. `"red"`) that the query asked for and the
+   * caption text mentions. Set by the color-boost pass in the ranker.
+   * Present means the final score got a boost and the UI should show a
+   * Color chip; absent means no color match (or no color query).
+   */
+  colorMatch?: string;
+  /**
+   * Weighted-mean ∆E between the scene's palette and the user-selected
+   * reference palette, when "find similar palette" is active. Lower is
+   * closer; surfaced as a palette-distance chip on the row.
+   */
+  paletteDistance?: number;
+}
+
+export interface ScoredScene extends RankableScene {
+  score: number;
+  /** Character ranges within `text` that matched, for rendering. */
+  matchSpans: Array<[number, number]>;
+  signals: SceneMatchSignals;
+}
+
+export interface RankOptions {
+  /** Drop scenes below this score. Defaults to 0.25. */
+  threshold?: number;
+}
+
+const DEFAULT_THRESHOLD = 0.25;
+
+/** Strip punctuation and lowercase. Preserves letters, digits, CJK, whitespace. */
+function normalize(text: string): string {
+  return text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ').replace(/\s+/g, ' ').trim();
+}
+
+function trigrams(text: string): Set<string> {
+  const padded = ` ${text} `;
+  const set = new Set<string>();
+  for (let i = 0; i < padded.length - 2; i += 1) {
+    set.add(padded.slice(i, i + 3));
+  }
+  return set;
+}
+
+/**
+ * Trigram overlap coefficient between two tokens. Normalizing by the
+ * smaller token's trigram set is more forgiving than Jaccard for short
+ * typo-laden queries where the whole caption would otherwise dominate the
+ * denominator.
+ */
+function tokenTrigramSimilarity(a: string, b: string): number {
+  if (a.length < 3 || b.length < 3) return 0;
+  const left = trigrams(a);
+  const right = trigrams(b);
+  let overlap = 0;
+  for (const tri of left) {
+    if (right.has(tri)) overlap += 1;
+  }
+  const denominator = Math.min(left.size, right.size);
+  return denominator === 0 ? 0 : overlap / denominator;
+}
+
+/**
+ * Typo-tolerant match gate. Fuzzy matching alone is too permissive — "orange"
+ * and "range" share four of their five interior trigrams, so a naive trigram
+ * score would surface "mountain range" results for an "orange" query.
+ *
+ * Anchoring on a shared prefix (at least half the query token, capped at 3
+ * chars) keeps typos at the back of the word matching ("kitchin" → "kitchen")
+ * while rejecting coincidental substring overlaps.
+ */
+function sharesQueryPrefix(queryToken: string, captionToken: string): boolean {
+  if (queryToken.length < 3) return false;
+  const prefixLen = Math.min(3, Math.max(2, Math.floor(queryToken.length / 2)));
+  return captionToken.startsWith(queryToken.slice(0, prefixLen));
+}
+
+/** Best fuzzy match for a single query token against any caption token. */
+function bestFuzzyTokenScore(queryToken: string, captionTokens: string[]): number {
+  let best = 0;
+  for (const captionToken of captionTokens) {
+    if (!sharesQueryPrefix(queryToken, captionToken)) continue;
+    const similarity = tokenTrigramSimilarity(queryToken, captionToken);
+    if (similarity > best) best = similarity;
+    if (best === 1) return 1;
+  }
+  return best;
+}
+
+/**
+ * Find ranges in the original `text` (case-insensitive) that match any of
+ * the query tokens. Overlapping ranges are merged so the renderer
+ * doesn't have to deduplicate.
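+ *
+ * E.g. findMatchSpans('Chef plates. Chef!', ['chef']) →
+ * [[0, 4], [13, 17]] — both occurrences, case-insensitive, merged.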
+ */ +function findMatchSpans(text: string, tokens: string[]): Array<[number, number]> { + if (tokens.length === 0) return []; + const lower = text.toLowerCase(); + const raw: Array<[number, number]> = []; + for (const token of tokens) { + if (token.length === 0) continue; + let from = 0; + while (from <= lower.length - token.length) { + const idx = lower.indexOf(token, from); + if (idx < 0) break; + raw.push([idx, idx + token.length]); + from = idx + token.length; + } + } + if (raw.length === 0) return []; + raw.sort((a, b) => a[0] - b[0]); + const merged: Array<[number, number]> = []; + for (const span of raw) { + const last = merged[merged.length - 1]; + if (last && span[0] <= last[1]) { + last[1] = Math.max(last[1], span[1]); + } else { + merged.push([span[0], span[1]]); + } + } + return merged; +} + +const FUZZY_TOKEN_THRESHOLD = 0.6; + +function scoreScene(query: string, queryTokens: string[], scene: RankableScene): number { + const captionNormalized = normalize(scene.text); + if (captionNormalized.length === 0) return 0; + + if (query.length > 0 && captionNormalized.includes(query)) { + return 1; + } + + const captionTokens = captionNormalized.split(' '); + if (queryTokens.length === 0) return 0; + + let exactOrPrefix = 0; + let fuzzySum = 0; + for (const queryToken of queryTokens) { + if (captionTokens.some((token) => token === queryToken || token.startsWith(queryToken))) { + exactOrPrefix += 1; + fuzzySum += 1; + continue; + } + const fuzzy = bestFuzzyTokenScore(queryToken, captionTokens); + if (fuzzy >= FUZZY_TOKEN_THRESHOLD) { + fuzzySum += fuzzy; + } + } + + // Prefix-heavy matches get a small bonus so "kitchen pots" in caption wins + // over "kichen pts" in a different caption at the same fuzzy coverage. + const tokenScore = (exactOrPrefix / queryTokens.length) * 0.9; + const fuzzyScore = (fuzzySum / queryTokens.length) * 0.8; + + return Math.max(tokenScore, fuzzyScore); +} + +/** + * Rank scenes against `query`. Empty query returns scenes unchanged (no + * filtering, no sorting) so callers can show the default timestamp-sorted + * view without a second code path. + */ +export function rankScenes( + query: string, + scenes: RankableScene[], + options: RankOptions = {}, +): ScoredScene[] { + const normalizedQuery = normalize(query); + if (normalizedQuery.length === 0) { + return scenes.map((scene) => ({ + ...scene, + score: 0, + matchSpans: [], + signals: { ranker: 'keyword' }, + })); + } + + const threshold = options.threshold ?? DEFAULT_THRESHOLD; + const queryTokens = normalizedQuery.split(' ').filter(Boolean); + + const scored: ScoredScene[] = []; + for (const scene of scenes) { + const score = scoreScene(normalizedQuery, queryTokens, scene); + if (score < threshold) continue; + scored.push({ + ...scene, + score, + matchSpans: findMatchSpans(scene.text, queryTokens), + signals: { ranker: 'keyword', keywordMatched: true }, + }); + } + + scored.sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; + if (a.mediaFileName !== b.mediaFileName) return a.mediaFileName.localeCompare(b.mediaFileName); + return a.timeSec - b.timeSec; + }); + return scored; +} diff --git a/src/features/scene-browser/utils/seek.ts b/src/features/scene-browser/utils/seek.ts new file mode 100644 index 000000000..a4fbfccf9 --- /dev/null +++ b/src/features/scene-browser/utils/seek.ts @@ -0,0 +1,38 @@ +/** + * Seek a scene in the Source Monitor. 
Mirrors media-card's handleSeekToCaption + * so Scene Browser rows open the source preview the same way a caption + * timestamp click does — setting the source player state alone isn't enough, + * the editor store's sourcePreviewMediaId is what actually mounts the panel. + */ + +import { + useEditorStore, + useMediaLibraryStore, + useSourcePlayerStore, +} from '../deps/media-library'; + +export const SCENE_SELECTION_DURATION_SEC = 3; + +export function seekToScene(mediaId: string, timeSec: number): void { + const media = useMediaLibraryStore.getState().mediaById[mediaId]; + if (!media) return; + const fps = media.fps || 30; + const sourceDurationFrames = Math.max(1, Math.round(media.duration * fps)); + const frame = Math.max(0, Math.min(sourceDurationFrames - 1, Math.round(timeSec * fps))); + const outFrame = Math.min( + sourceDurationFrames, + frame + Math.max(1, Math.round(SCENE_SELECTION_DURATION_SEC * fps)), + ); + + const source = useSourcePlayerStore.getState(); + // Pause the current scene synchronously — waiting for the seek-consume + // effect leaves the video element decoding the old frame, which is + // what the user sees as "flash of the old scene" when switching. + source.playerMethods?.pause(); + source.setCurrentMediaId(mediaId); + source.clearInOutPoints(); + source.setInPoint(frame); + source.setOutPoint(outFrame); + source.setPendingSeekFrame(frame); + useEditorStore.getState().setSourcePreviewMediaId(mediaId); +} diff --git a/src/features/scene-browser/utils/semantic-rank.test.ts b/src/features/scene-browser/utils/semantic-rank.test.ts new file mode 100644 index 000000000..b6d8655ac --- /dev/null +++ b/src/features/scene-browser/utils/semantic-rank.test.ts @@ -0,0 +1,362 @@ +import { describe, expect, it } from 'vitest'; +import { + cosineSimilarity, + semanticRank, + SEMANTIC_MATCH_THRESHOLD, +} from './semantic-rank'; +import type { RankableScene } from './rank'; + +function unit(values: number[]): Float32Array { + const magnitude = Math.sqrt(values.reduce((sum, x) => sum + x * x, 0)) || 1; + return Float32Array.from(values.map((x) => x / magnitude)); +} + +function scene(id: string, text: string): RankableScene { + return { + id, + mediaId: id.split(':')[0] ?? 
'm1', + mediaFileName: `${id}.mp4`, + timeSec: 0, + text, + }; +} + +describe('cosineSimilarity', () => { + it('returns 1 for identical unit vectors', () => { + const a = unit([1, 2, 3]); + expect(cosineSimilarity(a, a)).toBeCloseTo(1, 5); + }); + + it('returns 0 for orthogonal vectors', () => { + const a = unit([1, 0]); + const b = unit([0, 1]); + expect(cosineSimilarity(a, b)).toBeCloseTo(0, 5); + }); + + it('returns 0 when dimensions differ', () => { + expect(cosineSimilarity(unit([1, 0]), unit([1, 0, 0]))).toBe(0); + }); +}); + +describe('semanticRank', () => { + it('orders scenes by descending cosine similarity to the query', () => { + const query = unit([1, 0, 0]); + const scenes = [scene('a:0', 'first'), scene('b:0', 'second'), scene('c:0', 'third')]; + const embeddings = new Map([ + ['a:0', unit([0.9, 0.1, 0])], + ['b:0', unit([0.2, 1, 0])], + ['c:0', unit([1, 0, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['c:0', 'a:0', 'b:0']); + }); + + it('drops scenes below the threshold', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'a'), scene('b:0', 'b')]; + const embeddings = new Map([ + ['a:0', unit([0.99, 0.01])], + ['b:0', unit([0.01, 0.99])], + ]); + const result = semanticRank(query, scenes, embeddings); + expect(result.map((s) => s.id)).toEqual(['a:0']); + expect(result[0]!.score).toBeGreaterThan(SEMANTIC_MATCH_THRESHOLD); + }); + + it('skips scenes that have no embedding in the map', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'with'), scene('b:0', 'without')]; + const embeddings = new Map([ + ['a:0', unit([1, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('returns empty matchSpans so highlighting stays sane', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'orange sky over water')]; + const embeddings = new Map([['a:0', unit([1, 0])]]); + const [top] = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(top!.matchSpans).toEqual([]); + }); + + it('stable-sorts ties by filename then timestamp', () => { + const query = unit([1, 0]); + const scenes: RankableScene[] = [ + { id: 'b:0', mediaId: 'b', mediaFileName: 'b.mp4', timeSec: 5, text: 'b' }, + { id: 'a:0', mediaId: 'a', mediaFileName: 'a.mp4', timeSec: 10, text: 'a' }, + ]; + const embeddings = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([1, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['a:0', 'b:0']); + }); +}); + +describe('semanticRank with CLIP image signal', () => { + it('falls through to image match when caption text is weak', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0, 0]); + const scenes = [scene('a:0', 'terse caption')]; + const textEmbeds = new Map([ + ['a:0', unit([0.05, 1])], // nearly orthogonal to text query + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.9, 0.1, 0])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + expect(result[0]!.score).toBeGreaterThan(0.5); + }); + + it('takes max of text and image scores when both are present', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0, 0]); + const scenes = [scene('a:0', 'strong text'), scene('b:0', 
'strong image')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], // text cosine ≈ 1 + ['b:0', unit([0.1, 1])], // text cosine low + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.1, 1, 0])], // image cosine low + ['b:0', unit([1, 0, 0])], // image cosine ≈ 1 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + threshold: 0.2, + imageThreshold: 0.2, + }); + expect(result.map((s) => s.id).sort()).toEqual(['a:0', 'b:0']); + expect(result[0]!.score).toBeGreaterThan(0.9); + expect(result[1]!.score).toBeGreaterThan(0.9); + }); + + it('drops a scene only when both signals are below their thresholds', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'weak everywhere')]; + const textEmbeds = new Map([ + ['a:0', unit([0.1, 1])], // cosine ≈ 0.1 + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.05, 1])], // cosine ≈ 0.05 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('drops a scene whose only signal is a 0.21 visual match (below 0.22 threshold)', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('tower:0', 'A tall green tower at night')]; + const textEmbeds = new Map(); // no text match at all + // 0.21 cosine — the exact false-positive level observed in the wild + // for one-word queries before we raised the threshold. + const imageEmbeds = new Map([ + ['tower:0', unit([0.21, Math.sqrt(1 - 0.21 * 0.21)])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('drops a scene whose only signal is a Fair-tier visual match with no text support', () => { + // The "seated down → doorknob close-up" failure: CLIP cosine just + // above the 0.22 floor but no text corroboration. Should not pass. + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('doorknob:0', 'Close-up of a hand gripping a doorknob')]; + const textEmbeds = new Map([ + ['doorknob:0', unit([0.1, 1])], // text cosine ≈ 0.1, below Fair floor + ]); + const imageEmbeds = new Map([ + ['doorknob:0', unit([0.25, Math.sqrt(1 - 0.25 * 0.25)])], // image cosine ≈ 0.25 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('accepts a Fair-Fair scene where both sides mutually confirm', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'An elderly couple sits in a wheelchair')]; + // 0.32 text (Fair) × 0.23 image (Fair) — weak alone, confirming together. 
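+ // unit([x, Math.sqrt(1 - x * x)]) builds a unit vector whose cosine against unit([1, 0]) is exactly x, which is how these tests dial in precise tier scores.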
+ const textEmbeds = new Map([ + ['a:0', unit([0.32, Math.sqrt(1 - 0.32 * 0.32)])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.23, Math.sqrt(1 - 0.23 * 0.23)])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('accepts a scene on strong text alone, even when image is weak', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'strong text')]; + const textEmbeds = new Map([ + ['a:0', unit([0.5, Math.sqrt(1 - 0.5 * 0.5)])], // text cosine = 0.5, strong + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.15, Math.sqrt(1 - 0.15 * 0.15)])], // below Fair floor + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('still ranks a scene that has no image embedding on text alone', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'text-only scene')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], + ]); + const imageEmbeds = new Map(); // empty + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('ignores image side when queryImageEmbedding is null', () => { + const textQuery = unit([1, 0]); + const scenes = [scene('a:0', 'has image not text')]; + const textEmbeds = new Map([ + ['a:0', unit([0.1, 1])], // weak text + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong image + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: null, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); // image was strong but query image embed absent + }); + + it('uses palette-only ranking for explicit pure color queries', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'A person in a yellow jacket'), scene('b:0', 'A dark blue hallway')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong text match that should be ignored + ['b:0', unit([0, 1])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong visual match that should be ignored + ['b:0', unit([0, 1])], + ]); + const palettes = new Map([ + ['a:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], // blue + ['b:0', [{ l: 90, a: -5, b: 80, weight: 0.9 }]], // yellow + ]); + + const result = semanticRank(textQuery, scenes, textEmbeds, { + query: 'yellow color', + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + palettes, + }); + + expect(result.map((s) => s.id)).toEqual(['b:0']); + expect(result[0]?.signals.colorMatch).toBe('yellow'); + expect(result[0]?.signals.textScore).toBeUndefined(); + expect(result[0]?.signals.imageScore).toBeUndefined(); + }); + + it('keeps semantic text/image scoring for mixed color-content queries', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'Yellow kitchen interior'), scene('b:0', 'Blue hallway')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([0, 1])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([0, 1])], + ]); + const palettes = new Map([ + ['a:0', [{ 
l: 90, a: -5, b: 80, weight: 0.9 }]], + ['b:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], + ]); + + const result = semanticRank(textQuery, scenes, textEmbeds, { + query: 'yellow color kitchen', + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + palettes, + }); + + expect(result[0]?.id).toBe('a:0'); + expect(result[0]?.signals.colorMatch).toBe('yellow'); + expect(result[0]?.signals.textScore).toBeDefined(); + expect(result[0]?.signals.imageScore).toBeDefined(); + }); + + it('ranks by palette similarity and ignores text scores when referencePalette is set', () => { + // With a reference palette, the ranker should find scenes whose + // palettes are perceptually close to the reference, regardless of + // how well the text side matches the query vector. + const query = unit([1, 0]); + const scenes = [ + scene('warm:0', 'an unrelated caption'), + scene('cool:0', 'a perfect text match'), + ]; + const textEmbeds = new Map([ + ['warm:0', unit([0, 1])], + ['cool:0', unit([1, 0])], + ]); + const palettes = new Map([ + ['warm:0', [{ l: 53, a: 70, b: 50, weight: 0.9 }]], + ['cool:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], + ]); + const referencePalette = [{ l: 53, a: 70, b: 50, weight: 1 }]; + + const result = semanticRank(query, scenes, textEmbeds, { + palettes, + referencePalette, + }); + + expect(result.map((s) => s.id)).toEqual(['warm:0']); + expect(result[0]?.signals.paletteDistance).toBeDefined(); + expect(result[0]?.signals.textScore).toBeUndefined(); + }); + + it('falls back to the scene-level palette when paletteMap lacks the id', () => { + const query = unit([1, 0]); + const warmPalette = [{ l: 53, a: 70, b: 50, weight: 1 }]; + const scenes: RankableScene[] = [ + { ...scene('warm:0', 'x'), palette: warmPalette }, + ]; + const textEmbeds = new Map(); + const result = semanticRank(query, scenes, textEmbeds, { + referencePalette: warmPalette, + }); + expect(result).toHaveLength(1); + expect(result[0]?.score).toBeGreaterThan(0); + }); +}); diff --git a/src/features/scene-browser/utils/semantic-rank.ts b/src/features/scene-browser/utils/semantic-rank.ts new file mode 100644 index 000000000..4d8101de3 --- /dev/null +++ b/src/features/scene-browser/utils/semantic-rank.ts @@ -0,0 +1,205 @@ +/** + * Semantic ranker — cosine similarity over unit-length caption embeddings. + * + * Vectors coming out of `embeddingsProvider` are already L2-normalized + * (the worker uses `normalize: true`), so cosine similarity reduces to + * a dot product here. Keeping this module dependency-free makes it + * cheap to unit-test without spinning up a worker. + */ + +import type { PaletteEntry } from '../deps/analysis'; +import { + colorBoostFor, + paletteSimilarityBoost, + parseColorQuery, + type ColorBoostResult, +} from './color-boost'; +import type { RankableScene, ScoredScene } from './rank'; + +/** "Fair" tier floor for text cosines — a weakly confirming signal. */ +export const SEMANTIC_MATCH_THRESHOLD = 0.3; + +/** + * CLIP cosine scores cluster in a much narrower range than all-MiniLM + * text-to-text scores — even a strong visual match rarely clears 0.35, + * whereas a strong text match can hit 0.7+. Using separate thresholds + * keeps both signals on equal footing when we combine them below. + * + * 0.22 is the "Fair" floor — a weakly confirming signal. It used to be + * the *accept* threshold, but at that level CLIP's short-query + * distribution put ~50% of a 200-scene corpus past it on almost any + * prompt (the "seated down → skateboarding, doorknobs" failure). 
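+ * (The regression tests in semantic-rank.test.ts pin that case: a lone 0.21 image cosine must not rank, while 0.32 text plus 0.23 image, two mutually confirming Fair signals, must.)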
Now + * it gates combined weak-signal acceptance: Fair-Fair only counts when + * the text side ALSO clears its Fair floor. + */ +export const SEMANTIC_IMAGE_MATCH_THRESHOLD = 0.22; + +/** "Good" tier floor for text — strong enough to accept alone. */ +export const SEMANTIC_TEXT_STRONG_THRESHOLD = 0.4; + +/** "Strong" tier floor for CLIP image cosines — strong enough to accept alone. */ +export const SEMANTIC_IMAGE_STRONG_THRESHOLD = 0.3; + +export interface SemanticRankOptions { + /** Minimum text cosine to retain a scene (default 0.3). */ + threshold?: number; + /** Minimum image cosine to retain a scene (default 0.22). */ + imageThreshold?: number; + /** CLIP-text-encoder embedding of the query, for matching image side. */ + queryImageEmbedding?: Float32Array | null; + /** sceneId → CLIP image embedding, parallel to the text embeddings map. */ + imageEmbeddings?: Map<string, Float32Array>; + /** + * Raw user query. When it contains color terms, the ranker computes + * per-scene ∆E 2000 distance against each palette entry and folds + * the best match into the final score. Sidesteps CLIP's weakness on + * bare color queries. + */ + query?: string; + /** sceneId → dominant-color palette (CIELAB + weights). */ + palettes?: Map<string, PaletteEntry[]>; + /** + * Reference palette for "find similar colors" mode. When set, the + * ranker switches to palette-similarity scoring and ignores text/CLIP + * signals — object semantics aren't part of "scenes with this palette". + */ + referencePalette?: PaletteEntry[] | null; +} + +export function cosineSimilarity(a: Float32Array, b: Float32Array): number { + if (a.length !== b.length) return 0; + let sum = 0; + for (let i = 0; i < a.length; i += 1) { + sum += a[i]! * b[i]!; + } + return sum; +} + +/** + * Rank scenes by cosine similarity to the query embedding. When a CLIP + * text-encoder query embedding and parallel image-embedding map are + * supplied, each scene's final score is `max(text_cosine, image_cosine)` + * (each gated by its own threshold). This makes missing signals harmless: + * a scene without image embeddings still ranks on text alone, and a scene + * with a weak caption can still surface on visual match. + * + * Scenes whose id is absent from *both* embedding maps are dropped — + * they have no semantic signal to rank on. Callers should handle that + * via keyword fallback or the retroactive indexer. + */ +export function semanticRank( + queryEmbedding: Float32Array, + scenes: RankableScene[], + embeddings: Map<string, Float32Array>, + options: SemanticRankOptions = {}, +): ScoredScene[] { + const threshold = options.threshold ?? SEMANTIC_MATCH_THRESHOLD; + const imageThreshold = options.imageThreshold ?? SEMANTIC_IMAGE_MATCH_THRESHOLD; + const queryImage = options.queryImageEmbedding ?? null; + const imageMap = options.imageEmbeddings; + const paletteMap = options.palettes; + const referencePalette = options.referencePalette ?? null; + + // Parse color intent once so the per-scene loop stays tight. Explicit + // palette queries bypass text/CLIP scoring; mixed queries still get a + // palette boost on top of semantic meaning. A reference palette forces + // palette-only scoring regardless of the query shape. + const colorQuery = options.query ?
parseColorQuery(options.query) : { colors: [], paletteOnly: false }; + const queryColors = colorQuery.colors; + const hasColorQuery = queryColors.length > 0; + const paletteOnly = !!referencePalette || colorQuery.paletteOnly; + + const scored: ScoredScene[] = []; + for (const scene of scenes) { + if (referencePalette) { + const scenePalette = paletteMap?.get(scene.id) ?? scene.palette; + const similarity = paletteSimilarityBoost(referencePalette, scenePalette); + if (!similarity) continue; + scored.push({ + ...scene, + score: similarity.boost, + matchSpans: [], + signals: { + ranker: 'semantic', + paletteDistance: similarity.distance, + }, + }); + continue; + } + const textVector = embeddings.get(scene.id); + const imageVector = queryImage && imageMap ? imageMap.get(scene.id) : undefined; + + const textScore = textVector ? cosineSimilarity(queryEmbedding, textVector) : 0; + const imageScore = imageVector && queryImage + ? cosineSimilarity(queryImage, imageVector) + : 0; + + let colorBoost: ColorBoostResult | null = null; + if (hasColorQuery && paletteMap) { + colorBoost = colorBoostFor(queryColors, paletteMap.get(scene.id)); + } + + // Accept logic is side-aware: + // - When both text and image sides exist for this scene, weak + // "Fair" signals are only accepted when mutually confirmed — + // without this gate ~50% of a 200-scene corpus clears the Fair + // CLIP floor on almost any short query (cosines cluster tight), + // so unrelated thumbnails (doorknobs, skateboarding) surface. + // - When only one side is available (CLIP still loading, or scene + // not image-indexed yet), fall back to the per-side floor so + // honest single-signal matches still show up. + // - Image-alone is held to the strong bar — a CLIP-only Fair match + // is the exact noise pattern we're trying to kill. + const hasTextSide = !paletteOnly && !!textVector; + const hasImageSide = !paletteOnly && !!imageVector; + const fairText = hasTextSide && textScore >= threshold; + const fairImage = hasImageSide && imageScore >= imageThreshold; + const strongText = hasTextSide && textScore >= SEMANTIC_TEXT_STRONG_THRESHOLD; + const strongImage = hasImageSide && imageScore >= SEMANTIC_IMAGE_STRONG_THRESHOLD; + + let accept: boolean; + if (hasTextSide && hasImageSide) { + accept = strongText || strongImage || (fairText && fairImage); + } else if (hasTextSide) { + accept = fairText; + } else if (hasImageSide) { + accept = strongImage; + } else { + accept = false; + } + + const textOk = accept && fairText; + const imageOk = accept && fairImage; + const colorOk = !!colorBoost; + if (!accept && !colorOk) continue; + + // Max of text / image / color signals — weakest side doesn't drag + // down a strong one. The color boost is already in cosine-compatible + // units (see `MAX_BOOST` in color-boost.ts). + const baseScore = Math.max( + textOk ? textScore : 0, + imageOk ? imageScore : 0, + ); + const score = colorBoost ? Math.max(baseScore, colorBoost.boost) : baseScore; + + // Semantic matches don't map to character spans in the caption text, + // so highlighting is empty — the rest of the UI handles that case. + scored.push({ + ...scene, + score, + matchSpans: [], + signals: { + ranker: 'semantic', + textScore: !paletteOnly && textVector ? textScore : undefined, + imageScore: !paletteOnly && imageVector ? 
imageScore : undefined, + colorMatch: colorBoost?.family, + }, + }); + } + scored.sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; + if (a.mediaFileName !== b.mediaFileName) return a.mediaFileName.localeCompare(b.mediaFileName); + return a.timeSec - b.timeSec; + }); + return scored; +} diff --git a/src/features/settings/components/hotkey-editor-sections.ts b/src/features/settings/components/hotkey-editor-sections.ts index c559bf245..68b4f644c 100644 --- a/src/features/settings/components/hotkey-editor-sections.ts +++ b/src/features/settings/components/hotkey-editor-sections.ts @@ -113,6 +113,7 @@ export const HOTKEY_EDITOR_SECTIONS: readonly HotkeyEditorSection[] = [ items: [ { label: 'Save project', keys: ['SAVE'] }, { label: 'Export video', keys: ['EXPORT'] }, + { label: 'Open Scene Browser (search AI captions)', keys: ['OPEN_SCENE_BROWSER'] }, ], }, ] as const; diff --git a/src/features/settings/stores/settings-store.test.ts b/src/features/settings/stores/settings-store.test.ts index 20c91b226..d62b428da 100644 --- a/src/features/settings/stores/settings-store.test.ts +++ b/src/features/settings/stores/settings-store.test.ts @@ -8,7 +8,7 @@ const DEFAULT_SETTINGS = { editorDensity: 'compact' as const, maxUndoHistory: 50, autoSaveInterval: 0, - defaultWhisperModel: 'whisper-tiny' as const, + defaultWhisperModel: 'whisper-small' as const, defaultWhisperQuantization: 'hybrid' as const, defaultWhisperLanguage: '', }; @@ -26,7 +26,7 @@ describe('settings-store', () => { expect(state.editorDensity).toBe('compact'); expect(state.maxUndoHistory).toBe(50); expect(state.autoSaveInterval).toBe(0); - expect(state.defaultWhisperModel).toBe('whisper-tiny'); + expect(state.defaultWhisperModel).toBe('whisper-small'); expect(state.defaultWhisperQuantization).toBe('hybrid'); expect(state.defaultWhisperLanguage).toBe(''); }); @@ -53,6 +53,12 @@ describe('settings-store', () => { expect(useSettingsStore.getState().defaultWhisperQuantization).toBe('q8'); }); + it('normalizes legacy tiny model selections back to small', () => { + useSettingsStore.getState().setSetting('defaultWhisperModel', 'whisper-tiny'); + + expect(useSettingsStore.getState().defaultWhisperModel).toBe('whisper-small'); + }); + it('updates auto-save interval', () => { useSettingsStore.getState().setSetting('autoSaveInterval', 5); expect(useSettingsStore.getState().autoSaveInterval).toBe(5); diff --git a/src/features/settings/stores/settings-store.ts b/src/features/settings/stores/settings-store.ts index 0934c6a12..79f407c22 100644 --- a/src/features/settings/stores/settings-store.ts +++ b/src/features/settings/stores/settings-store.ts @@ -5,6 +5,7 @@ import { DEFAULT_WHISPER_LANGUAGE, DEFAULT_WHISPER_MODEL, DEFAULT_WHISPER_QUANTIZATION, + normalizeSelectableWhisperModel, } from '@/shared/utils/whisper-settings'; import type { EditorDensityPresetName } from '@/app/editor-layout'; import { DEFAULT_EDITOR_DENSITY_PRESET } from '@/app/editor-layout'; @@ -37,10 +38,64 @@ interface AppSettings { defaultWhisperQuantization: MediaTranscriptQuantization; defaultWhisperLanguage: string; + // AI captioning — interval between sampled frames when running LFM captions. + // Frames mode is converted to seconds at capture time using media.fps. + captioningIntervalUnit: CaptioningIntervalUnit; + captioningIntervalValue: number; + + // Scene Browser — how caption search matches queries. `semantic` uses a + // sentence-transformer model to rank by meaning; `keyword` uses + // substring + fuzzy-prefix matching on caption text. 
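+ // Both rankers return the same ScoredScene shape (score, matchSpans, signals), so the Scene Browser UI renders either mode's results identically.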
+ captionSearchMode: CaptionSearchMode; + // Keyboard shortcuts hotkeyOverrides: HotkeyOverrideMap; } +export type CaptionSearchMode = 'keyword' | 'semantic'; + +function normalizeCaptionSearchMode(value: unknown): CaptionSearchMode { + return value === 'semantic' ? 'semantic' : 'keyword'; +} + +export type CaptioningIntervalUnit = 'seconds' | 'frames'; + +export const CAPTIONING_INTERVAL_BOUNDS = { + seconds: { min: 0.5, max: 60 }, + frames: { min: 1, max: 1800 }, +} as const; + +export const DEFAULT_CAPTIONING_INTERVAL_SECONDS = 3; + +function normalizeCaptioningIntervalUnit(value: unknown): CaptioningIntervalUnit { + return value === 'frames' ? 'frames' : 'seconds'; +} + +function clampCaptioningIntervalValue( + value: unknown, + unit: CaptioningIntervalUnit, +): number { + const bounds = CAPTIONING_INTERVAL_BOUNDS[unit]; + const fallback = unit === 'seconds' ? DEFAULT_CAPTIONING_INTERVAL_SECONDS : 90; + const numeric = typeof value === 'number' && Number.isFinite(value) ? value : fallback; + return Math.min(bounds.max, Math.max(bounds.min, numeric)); +} + +/** + * Derive the effective `sampleIntervalSec` to pass to the captioning provider. + * Frames mode divides by the source media FPS (falling back to 30 when the + * media reports no usable frame rate). + */ +export function resolveCaptioningIntervalSec( + unit: CaptioningIntervalUnit, + value: number, + fps: number, +): number { + if (unit === 'seconds') return value; + const effectiveFps = fps > 0 ? fps : 30; + return value / effectiveFps; +} + interface SettingsActions { setSetting: <K extends keyof AppSettings>(key: K, value: AppSettings[K]) => void; setHotkeyBinding: (key: HotkeyKey, binding: string) => void; @@ -84,6 +139,13 @@ const DEFAULT_SETTINGS: AppSettings = { defaultWhisperQuantization: DEFAULT_WHISPER_QUANTIZATION, defaultWhisperLanguage: DEFAULT_WHISPER_LANGUAGE, + // AI captioning defaults + captioningIntervalUnit: 'seconds', + captioningIntervalValue: DEFAULT_CAPTIONING_INTERVAL_SECONDS, + + // Scene Browser defaults + captionSearchMode: 'keyword', + // Keyboard shortcuts hotkeyOverrides: {}, }; @@ -101,7 +163,24 @@ export const useSettingsStore = create<AppSettings & SettingsActions>()( (set) => ({ ...DEFAULT_SETTINGS, - setSetting: (key, value) => set({ [key]: value }), + setSetting: (key, value) => set((state) => { + if (key === 'defaultWhisperModel') { + return { [key]: normalizeSelectableWhisperModel(value as MediaTranscriptModel) }; + } + if (key === 'captioningIntervalUnit') { + const unit = normalizeCaptioningIntervalUnit(value); + return { + captioningIntervalUnit: unit, + captioningIntervalValue: clampCaptioningIntervalValue(state.captioningIntervalValue, unit), + }; + } + if (key === 'captioningIntervalValue') { + return { + captioningIntervalValue: clampCaptioningIntervalValue(value, state.captioningIntervalUnit), + }; + } + return { [key]: value }; + }), setHotkeyBinding: (key, binding) => set((state) => { const normalizedBinding = normalizeHotkeyBinding(binding); @@ -161,11 +240,19 @@ export const useSettingsStore = create<AppSettings & SettingsActions>()( name: 'freecut-settings', merge: (persistedState, currentState) => { const typedState = (persistedState as Partial<AppSettings> | undefined) ??
{}; + const captioningIntervalUnit = normalizeCaptioningIntervalUnit(typedState.captioningIntervalUnit); return { ...currentState, ...typedState, + defaultWhisperModel: normalizeSelectableWhisperModel(typedState.defaultWhisperModel), hotkeyOverrides: sanitizeHotkeyOverrides(typedState.hotkeyOverrides), + captioningIntervalUnit, + captioningIntervalValue: clampCaptioningIntervalValue( + typedState.captioningIntervalValue, + captioningIntervalUnit, + ), + captionSearchMode: normalizeCaptionSearchMode(typedState.captionSearchMode), }; }, } diff --git a/src/features/timeline/components/timeline-content.test.tsx b/src/features/timeline/components/timeline-content.test.tsx index 0986cfbad..c4d479cc6 100644 --- a/src/features/timeline/components/timeline-content.test.tsx +++ b/src/features/timeline/components/timeline-content.test.tsx @@ -1,5 +1,5 @@ import { createRef, type ReactNode } from 'react'; -import { act, render, waitFor } from '@testing-library/react'; +import { act, fireEvent, render, waitFor } from '@testing-library/react'; import { beforeAll, beforeEach, describe, expect, it, vi } from 'vitest'; import { useEditorStore } from '@/app/state/editor'; @@ -131,6 +131,7 @@ beforeAll(() => { function resetStores() { useEditorStore.setState({ linkedSelectionEnabled: true, + transcriptionDialogDepth: 0, }); useSelectionStore.setState({ @@ -216,6 +217,39 @@ describe('TimelineContent playback selection behavior', () => { }); }); + it('does not update the hover scrub preview while the transcription dialog is open', async () => { + const { container } = render(); + const scrollContainer = container.querySelector('[data-timeline-scroll-container]'); + + if (!(scrollContainer instanceof HTMLDivElement)) { + throw new Error('Expected timeline scroll container'); + } + + Object.defineProperty(scrollContainer, 'getBoundingClientRect', { + configurable: true, + value: () => ({ + left: 0, + top: 0, + right: 400, + bottom: 200, + width: 400, + height: 200, + x: 0, + y: 0, + toJSON: () => ({}), + }), + }); + + act(() => { + useEditorStore.setState({ transcriptionDialogDepth: 1 }); + usePlaybackStore.getState().setPreviewFrame(12); + }); + + fireEvent.mouseMove(scrollContainer, { clientX: 180, clientY: 48 }); + + expect(usePlaybackStore.getState().previewFrame).toBeNull(); + }); + it('reveals the active track when selection moves to an offscreen lane', async () => { const videoTracks: TimelineTrack[] = [ { ...VIDEO_TRACK, id: 'track-video-1', name: 'V1', order: 0 }, @@ -375,4 +409,19 @@ describe('TimelineContent playback selection behavior', () => { }); expect(audioScrollContainer!.scrollTop).toBe(55); }); + + it('does not clear previewFrame on ruler mousedown before the ruler handler runs', () => { + const { container } = render(); + + act(() => { + usePlaybackStore.getState().setPreviewFrame(24); + }); + + const ruler = container.querySelector('.timeline-ruler') as HTMLDivElement | null; + expect(ruler).toBeTruthy(); + + fireEvent.mouseDown(ruler!, { button: 0 }); + + expect(usePlaybackStore.getState().previewFrame).toBe(24); + }); }); diff --git a/src/features/timeline/components/timeline-content.tsx b/src/features/timeline/components/timeline-content.tsx index 432bb696c..3deca3c26 100644 --- a/src/features/timeline/components/timeline-content.tsx +++ b/src/features/timeline/components/timeline-content.tsx @@ -51,6 +51,12 @@ import { useTransitionsStore } from '../stores/transitions-store'; import { getFilteredItemSnapEdges } from '../utils/timeline-snap-utils'; import { expandSelectionWithLinkedItems 
} from '../utils/linked-items'; import { getTimelineWidth, getZoomToFitLevel } from '../utils/timeline-layout'; +import { + getAnchoredZoomScrollLeft, + getCursorZoomAnchor, + getPlayheadZoomAnchor, + type TimelineZoomAnchor, +} from '../utils/zoom-anchor'; const ACTIVE_TIMELINE_GESTURE_CURSOR_CLASSES = [ 'timeline-cursor-trim-left', @@ -720,6 +726,7 @@ export const TimelineContent = memo(function TimelineContent({ const selectMarker = useSelectionStore((s) => s.selectMarker); const clearItemSelection = useSelectionStore((s) => s.clearItemSelection); const activeTrackId = useSelectionStore((s) => s.activeTrackId); + const isTranscriptionDialogOpen = useEditorStore((s) => s.transcriptionDialogDepth > 0); // Granular selectors for drag state - avoid subscribing to entire dragState object const isDragging = useSelectionStore((s) => !!s.dragState?.isDragging); const containerRef = useRef(null); @@ -758,6 +765,13 @@ export const TimelineContent = memo(function TimelineContent({ } }, [isDragging]); + useEffect(() => { + if (!isTranscriptionDialogOpen) return; + if (usePlaybackStore.getState().previewFrame !== null) { + usePlaybackStore.getState().setPreviewFrame(null); + } + }, [isTranscriptionDialogOpen]); + // Cleanup preview RAF on unmount useEffect(() => { return () => { @@ -1125,12 +1139,23 @@ export const TimelineContent = memo(function TimelineContent({ // Preview scrubber: show ghost playhead on hover const handleTimelineMouseDownCapture = useCallback((e: React.MouseEvent) => { if (e.button !== 0) return; + const target = e.target as HTMLElement; + if (target.closest('.timeline-ruler') || target.closest('[data-playhead-handle]')) { + return; + } if (usePlaybackStore.getState().previewFrame !== null) { setPreviewFrameRef.current(null); } }, []); const handleTimelineMouseMove = useCallback((e: React.MouseEvent) => { + if (useEditorStore.getState().transcriptionDialogDepth > 0) { + if (usePlaybackStore.getState().previewFrame !== null) { + setPreviewFrameRef.current(null); + } + return; + } + // Skip during playback if (usePlaybackStore.getState().isPlaying) { if (usePlaybackStore.getState().previewFrame !== null) { @@ -1235,15 +1260,27 @@ export const TimelineContent = memo(function TimelineContent({ actualDurationRef.current = actualDuration; + useLayoutEffect(() => { + const container = containerRef.current; + if (!container) { + return; + } + + const maxScrollLeft = Math.max(0, timelineWidth - container.clientWidth); + if (container.scrollLeft <= maxScrollLeft + 1) { + return; + } + + // Clamp stale scroll after timeline shrink so ruler and tracks stay aligned + // without subscribing broad UI surfaces to item-array churn. 
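+ // (Illustrative numbers: a 4000px-wide timeline in a 1200px viewport allows scrollLeft up to 2800; shrink the timeline to 2000px and the max drops to 800, so a stale 2800 snaps back.)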
+ container.scrollLeft = maxScrollLeft; + scrollLeftRef.current = maxScrollLeft; + syncViewportFromContainer(); + }, [timelineWidth, syncViewportFromContainer]); + // NOTE: itemsByTrack removed - TimelineTrack now fetches its own items // This prevents cascade re-renders when only one track's items change - /** - * Adjusts scroll position to keep cursor position stable when zoom changes - * (Anchor zooming - cursor stays visually fixed, content scales around it) - * - * Uses refs for dynamic values to avoid callback recreation on every render - */ const scheduleZoomApply = useCallback((nextZoomLevel: number, nextScrollLeft: number) => { queuedZoomLevelRef.current = nextZoomLevel; queuedZoomScrollLeftRef.current = nextScrollLeft; @@ -1278,64 +1315,64 @@ export const TimelineContent = memo(function TimelineContent({ } }, []); - const applyZoomWithPlayheadCentering = useCallback((newZoomLevel: number) => { - const container = containerRef.current; - if (!container) return; - + const applyZoomWithAnchor = useCallback((newZoomLevel: number, anchor: TimelineZoomAnchor) => { const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; - - // Clamp zoom to valid range const clampedZoom = Math.max(0.01, Math.min(2, newZoomLevel)); if (clampedZoom === currentZoom) return; - // Cursor's screen position (relative to container's visible left edge) - const cursorScreenX = zoomCursorXRef.current; + const nextScrollLeft = getAnchoredZoomScrollLeft({ + anchor, + maxDurationSeconds: actualDurationRef.current, + nextZoomLevel: clampedZoom, + }); - // Calculate cursor's position in CONTENT coordinates (timeline space) - const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; - const cursorContentX = baseScrollLeft + cursorScreenX; + scheduleZoomApply(clampedZoom, nextScrollLeft); + }, [scheduleZoomApply]); - // Convert to time using current zoom, clamped to actual content duration - const currentPixelsPerSecond = currentZoom * 100; - const cursorTime = Math.min( - cursorContentX / currentPixelsPerSecond, - actualDurationRef.current - ); + const applyZoomWithCursorAnchor = useCallback((newZoomLevel: number) => { + const container = containerRef.current; + if (!container) return; - // Calculate where that same time point will be at the new zoom - const newPixelsPerSecond = clampedZoom * 100; - const newCursorContentX = cursorTime * newPixelsPerSecond; + const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; + const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; - // Calculate scroll needed to keep cursor at same screen position - // cursor should stay at cursorScreenX, so: - // newScrollLeft + cursorScreenX = newCursorContentX - // newScrollLeft = newCursorContentX - cursorScreenX - const newScrollLeft = newCursorContentX - cursorScreenX; + applyZoomWithAnchor(newZoomLevel, getCursorZoomAnchor({ + currentZoomLevel: currentZoom, + cursorScreenX: zoomCursorXRef.current, + maxDurationSeconds: actualDurationRef.current, + scrollLeft: baseScrollLeft, + })); + }, [applyZoomWithAnchor]); - // Only clamp to prevent negative scroll (left boundary) - const clampedScrollLeft = Math.max(0, newScrollLeft); + const applyZoomWithPlayheadAnchor = useCallback((newZoomLevel: number) => { + const container = containerRef.current; + if (!container) return; - // Coalesce dense wheel updates into a single visual zoom publish per frame. 
- scheduleZoomApply(clampedZoom, clampedScrollLeft); - }, [scheduleZoomApply]); + const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; + const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; + + applyZoomWithAnchor(newZoomLevel, getPlayheadZoomAnchor({ + currentFrame: currentFrameRef.current, + currentZoomLevel: currentZoom, + fps: useTimelineStore.getState().fps, + maxDurationSeconds: actualDurationRef.current, + scrollLeft: baseScrollLeft, + })); + }, [applyZoomWithAnchor]); - // Create zoom handlers that include playhead centering - // These callbacks are stable and don't recreate on every render thanks to refs const handleZoomChange = useCallback((newZoom: number) => { - applyZoomWithPlayheadCentering(newZoom); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoom); + }, [applyZoomWithPlayheadAnchor]); const handleZoomIn = useCallback(() => { - // Use standard zoom step (0.1), read from ref to avoid callback recreation const newZoomLevel = Math.min(2, zoomLevelRef.current + 0.1); - applyZoomWithPlayheadCentering(newZoomLevel); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoomLevel); + }, [applyZoomWithPlayheadAnchor]); const handleZoomOut = useCallback(() => { - // Use standard zoom step (0.1), read from ref to avoid callback recreation const newZoomLevel = Math.max(0.01, zoomLevelRef.current - 0.1); - applyZoomWithPlayheadCentering(newZoomLevel); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoomLevel); + }, [applyZoomWithPlayheadAnchor]); // Keep a ref to containerWidth for use in stable callbacks const containerWidthRef = useRef(containerWidth); @@ -1460,7 +1497,7 @@ export const TimelineContent = memo(function TimelineContent({ const logZoom = Math.log(currentZoom); const newLogZoom = logZoom - velocityZoomRef.current * 1.2; // Scale factor for feel const newZoomLevel = Math.exp(newLogZoom); - applyZoomWithPlayheadCentering(newZoomLevel); + applyZoomWithCursorAnchor(newZoomLevel); lastZoomApplyTimeRef.current = now; } @@ -1479,7 +1516,7 @@ export const TimelineContent = memo(function TimelineContent({ }; momentumIdRef.current = requestAnimationFrame(momentumLoop); - }, [applyZoomWithPlayheadCentering]); + }, [applyZoomWithCursorAnchor]); // Cleanup momentum on unmount useEffect(() => { @@ -1540,7 +1577,7 @@ export const TimelineContent = memo(function TimelineContent({ newZoom = Math.min(MAX_ZOOM, currentZoom * ZOOM_FACTOR); } - applyZoomWithPlayheadCentering(newZoom); + applyZoomWithCursorAnchor(newZoom); return; } @@ -1589,7 +1626,7 @@ export const TimelineContent = memo(function TimelineContent({ return () => { container.removeEventListener('wheel', wheelHandler); }; - }, [applyZoomWithPlayheadCentering, getVerticalScrollTarget, hasTrackSections, startMomentumScroll]); + }, [applyZoomWithCursorAnchor, getVerticalScrollTarget, hasTrackSections, startMomentumScroll]); const singleSectionTracks = videoTracks.length > 0 ? videoTracks : audioTracks; const singleSectionKind = videoTracks.length > 0 ? 'video' : 'audio'; @@ -1715,4 +1752,4 @@ export const TimelineContent = memo(function TimelineContent({
); }); - \ No newline at end of file + diff --git a/src/features/timeline/components/timeline-item/index.tsx b/src/features/timeline/components/timeline-item/index.tsx index 3712bb059..262c82344 100644 --- a/src/features/timeline/components/timeline-item/index.tsx +++ b/src/features/timeline/components/timeline-item/index.tsx @@ -21,8 +21,17 @@ import { useTransitionDragStore, } from '@/shared/state/transition-drag'; import { useMediaLibraryStore } from '@/features/timeline/deps/media-library-store'; +import { mediaTranscriptionService } from '@/features/timeline/deps/media-transcription-service'; +import { TranscribeDialog, type TranscribeDialogValues } from '@/features/timeline/deps/transcribe-dialog'; +import { + getTranscriptionOverallPercent, + getTranscriptionStageLabel, +} from '@/shared/utils/transcription-progress'; +import { + isTranscriptionOutOfMemoryError, + TRANSCRIPTION_OOM_HINT, +} from '@/shared/utils/transcription-cancellation'; import type { PreviewItemUpdate } from '../../utils/item-edit-preview'; -import { useSettingsStore } from '@/features/timeline/deps/settings'; import { useTimelineDrag, dragOffsetRef, dragPreviewOffsetByItemRef } from '../../hooks/use-timeline-drag'; import { useTimelineTrim } from '../../hooks/use-timeline-trim'; import { useTrackPush } from '../../hooks/use-track-push'; @@ -208,6 +217,38 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [item.mediaId] ) ); + const transcriptProgress = useMediaLibraryStore( + useCallback( + (s) => (item.mediaId ? s.transcriptProgress.get(item.mediaId) ?? null : null), + [item.mediaId] + ) + ); + const mediaFileName = useMediaLibraryStore( + useCallback( + (s) => (item.mediaId + ? s.mediaItems.find((m) => m.id === item.mediaId)?.fileName ?? '' + : ''), + [item.mediaId] + ) + ); + const [captionDialogOpen, setCaptionDialogOpen] = useState(false); + const [captionDialogError, setCaptionDialogError] = useState(null); + const mediaHasTranscript = transcriptStatus === 'ready'; + const captionStartedRef = useRef(false); + const captionStopRequestedRef = useRef(false); + + const captionIsActive = + transcriptStatus === 'queued' || transcriptStatus === 'transcribing'; + useEffect(() => { + if (captionStartedRef.current && !captionIsActive) { + captionStartedRef.current = false; + const keepOpen = captionStopRequestedRef.current || captionDialogError !== null; + captionStopRequestedRef.current = false; + setCaptionDialogOpen((wasOpen) => { + return wasOpen && keepOpen; + }); + } + }, [captionIsActive, captionDialogError]); // O(1) index lookup that preserves both explicit captionSource links and // legacy generated-caption detection. const hasGeneratedCaptions = useItemsStore( @@ -216,11 +257,13 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [item.id] ) ); - const defaultWhisperModel = useSettingsStore((s) => s.defaultWhisperModel); // O(1) via index, including legacy linked audio/video pairs. const isLinked = useItemsStore( useCallback((s) => !!s.linkedItemsByItemId[item.id], [item.id]) ); + const linkedItemsForCaptionOwnership = useItemsStore( + useCallback((s) => s.linkedItemsByItemId[item.id] ?? EMPTY_LINKED_ITEMS, [item.id]) + ); const linkedSelectionEnabled = useEditorStore((s) => s.linkedSelectionEnabled); const segmentOverlays = useTimelineItemOverlayStore( useCallback((s) => s.overlaysByItemId[item.id] ?? 
EMPTY_SEGMENT_OVERLAYS, [item.id]) @@ -241,6 +284,20 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [itemKeyframes] ); const hasKeyframes = keyframedProperties.length > 0; + const linkedVideoCaptionOwner = useMemo(() => { + if (item.type !== 'audio' || !item.mediaId) { + return null; + } + + return linkedItemsForCaptionOwnership.find((linkedItem) => ( + linkedItem.id !== item.id + && linkedItem.type === 'video' + && linkedItem.mediaId === item.mediaId + )) ?? null; + }, [item.id, item.mediaId, item.type, linkedItemsForCaptionOwnership]); + const canManageCaptions = !!item.mediaId + && !isBroken + && (item.type === 'video' || (item.type === 'audio' && linkedVideoCaptionOwner === null)); // Use refs for actions to avoid selector re-renders - read from store in callbacks const activeTool = useSelectionStore((s) => s.activeTool); @@ -1403,7 +1460,6 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration getCanLinkSelected, getCanUnlinkSelected, hasSpeakableText, - isCaptionGenerationActive, isSceneDetectionActive, isCompositionItem, handleJoinSelected, @@ -1418,8 +1474,8 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration handleBentoLayout, handleFreezeFrame, handleGenerateAudioFromText, - handleGenerateCaptions, - handleRegenerateCaptions, + handleCaptionsFromDialog, + handleApplyCaptionsFromTranscript, handleCreatePreComp, handleEnterComposition, handleDissolveComposition, @@ -2480,12 +2536,19 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration onFreezeFrame={handleFreezeFrame} isTextItem={item.type === 'text' && hasSpeakableText} onGenerateAudioFromText={handleGenerateAudioFromText} - canGenerateCaptions={(item.type === 'video' || item.type === 'audio') && !!item.mediaId && !isBroken} - canRegenerateCaptions={hasGeneratedCaptions} - isGeneratingCaptions={isCaptionGenerationActive || transcriptStatus === 'transcribing'} - defaultCaptionModel={defaultWhisperModel} - onGenerateCaptions={handleGenerateCaptions} - onRegenerateCaptions={handleRegenerateCaptions} + canManageCaptions={canManageCaptions} + hasCaptions={hasGeneratedCaptions} + hasTranscript={mediaHasTranscript} + isGeneratingCaptions={ + transcriptStatus === 'queued' + || transcriptStatus === 'transcribing' + } + onOpenCaptionDialog={() => { + captionStopRequestedRef.current = false; + setCaptionDialogError(null); + setCaptionDialogOpen(true); + }} + onApplyCaptionsFromTranscript={handleApplyCaptionsFromTranscript} isCompositionItem={isCompositionItem} onEnterComposition={handleEnterComposition} onDissolveComposition={handleDissolveComposition} @@ -2864,6 +2927,56 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration /> + {canManageCaptions && item.mediaId && ( + { + if (!next) setCaptionDialogError(null); + setCaptionDialogOpen(next); + }} + fileName={mediaFileName} + hasTranscript={mediaHasTranscript} + isRunning={ + transcriptStatus === 'queued' + || transcriptStatus === 'transcribing' + } + progressPercent={ + transcriptProgress + ? Math.round(getTranscriptionOverallPercent(transcriptProgress)) + : null + } + progressLabel={ + transcriptProgress + ? `${getTranscriptionStageLabel(transcriptProgress.stage)} (${Math.round( + getTranscriptionOverallPercent(transcriptProgress), + )}%)` + : 'Transcribing...' 
+ } + errorMessage={captionDialogError} + onStart={(values: TranscribeDialogValues) => { + captionStartedRef.current = true; + captionStopRequestedRef.current = false; + setCaptionDialogError(null); + handleCaptionsFromDialog(values, hasGeneratedCaptions, (error) => { + captionStartedRef.current = false; + const baseMessage = error instanceof Error + ? error.message + : 'Failed to generate captions'; + setCaptionDialogError( + isTranscriptionOutOfMemoryError(error) + ? TRANSCRIPTION_OOM_HINT + : baseMessage, + ); + }); + }} + onCancel={() => { + if (item.mediaId) { + captionStopRequestedRef.current = true; + mediaTranscriptionService.cancelTranscription(item.mediaId); + } + }} + /> + )} ); }, (prevProps, nextProps) => { diff --git a/src/features/timeline/components/timeline-item/item-context-menu.test.tsx b/src/features/timeline/components/timeline-item/item-context-menu.test.tsx index 43996f5fb..26f940d9b 100644 --- a/src/features/timeline/components/timeline-item/item-context-menu.test.tsx +++ b/src/features/timeline/components/timeline-item/item-context-menu.test.tsx @@ -39,11 +39,6 @@ vi.mock('@/features/timeline/deps/analysis', () => ({ getSceneVerificationModelOptions: mockGetSceneVerificationModelOptions, })); -vi.mock('@/features/timeline/deps/media-transcription-service', () => ({ - getMediaTranscriptionModelLabel: (model: string) => model, - getMediaTranscriptionModelOptions: () => [], -})); - vi.mock('@/features/timeline/deps/settings', () => ({ useResolvedHotkeys: () => ({}), })); @@ -114,3 +109,54 @@ describe('ItemContextMenu scene detection', () => { expect(onDetectScenes).toHaveBeenCalledWith('optical-flow', 'lfm'); }); }); + +describe('ItemContextMenu captions', () => { + it('shows a single "Generate Captions" item when no transcript exists', () => { + const onOpenCaptionDialog = vi.fn(); + + renderContextMenu({ + canManageCaptions: true, + hasCaptions: false, + hasTranscript: false, + onOpenCaptionDialog, + }); + + const item = screen.getByRole('button', { name: 'Generate Captions' }); + expect(item).toBeInTheDocument(); + expect(screen.queryByText('Captions')).not.toBeInTheDocument(); + fireEvent.click(item); + expect(onOpenCaptionDialog).toHaveBeenCalledTimes(1); + }); + + it('shows a Captions submenu with Insert + Generate when a transcript already exists', () => { + const onOpenCaptionDialog = vi.fn(); + const onApplyCaptionsFromTranscript = vi.fn(); + + renderContextMenu({ + canManageCaptions: true, + hasCaptions: false, + hasTranscript: true, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, + }); + + expect(screen.getByText('Captions')).toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Insert Existing Captions' })).toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Generate Captions' })).toBeInTheDocument(); + + fireEvent.click(screen.getByRole('button', { name: 'Insert Existing Captions' })); + expect(onApplyCaptionsFromTranscript).toHaveBeenCalledTimes(1); + }); + + it('labels the generate item "Regenerate Captions" when the clip already has captions', () => { + renderContextMenu({ + canManageCaptions: true, + hasCaptions: true, + hasTranscript: true, + onOpenCaptionDialog: vi.fn(), + onApplyCaptionsFromTranscript: vi.fn(), + }); + + expect(screen.getByRole('button', { name: 'Regenerate Captions' })).toBeInTheDocument(); + }); +}); diff --git a/src/features/timeline/components/timeline-item/item-context-menu.tsx b/src/features/timeline/components/timeline-item/item-context-menu.tsx index de117ccb4..f4e27d58c 100644 --- 
a/src/features/timeline/components/timeline-item/item-context-menu.tsx +++ b/src/features/timeline/components/timeline-item/item-context-menu.tsx @@ -18,11 +18,6 @@ import { import { useSelectionStore } from '@/shared/state/selection'; import { PROPERTY_LABELS, type AnimatableProperty } from '@/types/keyframe'; import type { PropertyKeyframes } from '@/types/keyframe'; -import type { MediaTranscriptModel } from '@/types/storage'; -import { - getMediaTranscriptionModelLabel, - getMediaTranscriptionModelOptions, -} from '@/features/timeline/deps/media-transcription-service'; import { getSceneVerificationModelOptions, type VerificationModel, @@ -58,12 +53,12 @@ interface ItemContextMenuProps { /** Whether the playhead is within this item's bounds */ playheadInBounds?: boolean; onFreezeFrame?: () => void; - canGenerateCaptions?: boolean; - canRegenerateCaptions?: boolean; + canManageCaptions?: boolean; + hasCaptions?: boolean; + hasTranscript?: boolean; isGeneratingCaptions?: boolean; - defaultCaptionModel?: MediaTranscriptModel; - onGenerateCaptions?: (model: MediaTranscriptModel) => void; - onRegenerateCaptions?: (model: MediaTranscriptModel) => void; + onOpenCaptionDialog?: () => void; + onApplyCaptionsFromTranscript?: () => void; /** Whether this item is a composition item (enables enter/dissolve options) */ isCompositionItem?: boolean; onEnterComposition?: () => void; @@ -113,12 +108,12 @@ export const ItemContextMenu = memo(function ItemContextMenu({ isVideoItem, playheadInBounds, onFreezeFrame, - canGenerateCaptions, - canRegenerateCaptions, + canManageCaptions, + hasCaptions, + hasTranscript, isGeneratingCaptions, - defaultCaptionModel, - onGenerateCaptions, - onRegenerateCaptions, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, isCompositionItem, onEnterComposition, onDissolveComposition, @@ -175,12 +170,12 @@ export const ItemContextMenu = memo(function ItemContextMenu({ isVideoItem={isVideoItem} playheadInBounds={playheadInBounds} onFreezeFrame={onFreezeFrame} - canGenerateCaptions={canGenerateCaptions} - canRegenerateCaptions={canRegenerateCaptions} + canManageCaptions={canManageCaptions} + hasCaptions={hasCaptions} + hasTranscript={hasTranscript} isGeneratingCaptions={isGeneratingCaptions} - defaultCaptionModel={defaultCaptionModel} - onGenerateCaptions={onGenerateCaptions} - onRegenerateCaptions={onRegenerateCaptions} + onOpenCaptionDialog={onOpenCaptionDialog} + onApplyCaptionsFromTranscript={onApplyCaptionsFromTranscript} isCompositionItem={isCompositionItem} onEnterComposition={onEnterComposition} onDissolveComposition={onDissolveComposition} @@ -255,12 +250,12 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ isVideoItem, playheadInBounds, onFreezeFrame, - canGenerateCaptions, - canRegenerateCaptions, + canManageCaptions, + hasCaptions, + hasTranscript, isGeneratingCaptions, - defaultCaptionModel, - onGenerateCaptions, - onRegenerateCaptions, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, isCompositionItem, onEnterComposition, onDissolveComposition, @@ -286,18 +281,11 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ if (!keyframedProperties) return []; return keyframedProperties.filter(p => p.keyframes.length > 0); }, [keyframedProperties]); - const transcriptionModelOptions = useMemo( - () => getMediaTranscriptionModelOptions(), - [], - ); - const explicitCaptionModelOptions = useMemo( - () => transcriptionModelOptions.filter((option) => option.value !== defaultCaptionModel), - [defaultCaptionModel, 
transcriptionModelOptions], - ); const sceneVerificationModelOptions = useMemo( () => getSceneVerificationModelOptions(), [], ); + const captionActionLabel = hasCaptions ? 'Regenerate Captions' : 'Generate Captions'; const hasKeyframes = propertiesWithKeyframes.length > 0; @@ -454,60 +442,28 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ )} - {canGenerateCaptions && onGenerateCaptions && ( + {canManageCaptions && onOpenCaptionDialog && ( <> {isGeneratingCaptions ? ( - Updating Captions... + Updating captions... + ) : hasTranscript && onApplyCaptionsFromTranscript ? ( + + Captions + + + Insert Existing Captions + + + {captionActionLabel} + + + ) : ( - <> - - Generate Captions for Segment - - {defaultCaptionModel && ( - <> - onGenerateCaptions(defaultCaptionModel)}> - {`Default (${getMediaTranscriptionModelLabel(defaultCaptionModel)})`} - - - - )} - {explicitCaptionModelOptions.map((option) => ( - onGenerateCaptions(option.value)} - > - {option.label} - - ))} - - - - {canRegenerateCaptions && onRegenerateCaptions && ( - - Regenerate Captions for Segment - - {defaultCaptionModel && ( - <> - onRegenerateCaptions(defaultCaptionModel)}> - {`Default (${getMediaTranscriptionModelLabel(defaultCaptionModel)})`} - - - - )} - {explicitCaptionModelOptions.map((option) => ( - onRegenerateCaptions(option.value)} - > - {option.label} - - ))} - - - )} - + + {captionActionLabel} + )} diff --git a/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts b/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts index b1372a1bc..0279752b1 100644 --- a/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts +++ b/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts @@ -2,13 +2,20 @@ import { useCallback, useEffect, useRef } from 'react'; import { toast } from 'sonner'; import type { TimelineItem as TimelineItemType } from '@/types/timeline'; import type { AnimatableProperty } from '@/types/keyframe'; -import type { MediaTranscriptModel } from '@/types/storage'; +import type { + MediaTranscriptModel, + MediaTranscriptQuantization, +} from '@/types/storage'; import { useSelectionStore } from '@/shared/state/selection'; import { usePlaybackStore } from '@/shared/state/playback'; import { useClearKeyframesDialogStore } from '@/app/state/clear-keyframes-dialog'; import { useTtsGenerateDialogStore } from '@/app/state/tts-generate-dialog'; -import { isLocalInferenceCancellationError } from '@/shared/state/local-inference'; -import { getTranscriptionOverallPercent } from '@/shared/utils/transcription-progress'; +import { scheduleAfterPaint } from '@/shared/utils/schedule-after-paint'; +import { + isTranscriptionCancellationError, + isTranscriptionOutOfMemoryError, + TRANSCRIPTION_OOM_HINT, +} from '@/shared/utils/transcription-cancellation'; import { useMediaLibraryStore } from '@/features/timeline/deps/media-library-store'; import { getMediaTranscriptionModelLabel, @@ -40,8 +47,11 @@ import { } from '../../deps/analysis'; import { resolveMediaUrl } from '../../deps/media-library-resolver'; import { useBentoLayoutDialogStore } from '../bento-layout-dialog-store'; +import { createLogger } from '@/shared/logging/logger'; +import { saveScenes } from '@/infrastructure/storage/workspace-fs/scenes'; + +const logger = createLogger('UseTimelineItemActions'); -const CAPTION_GENERATION_OVERLAY_ID = 'caption-generation'; const SCENE_DETECTION_OVERLAY_ID = 'scene-detection'; interface UseTimelineItemActionsParams { @@ -180,6 
+190,9 @@ export function useTimelineItemActions({ options?: { forceTranscription?: boolean; replaceExisting?: boolean; + quantization?: MediaTranscriptQuantization; + language?: string; + onError?: (error: unknown) => void; }, ) => { if ((item.type !== 'video' && item.type !== 'audio') || !item.mediaId || isBroken) { @@ -189,11 +202,9 @@ export function useTimelineItemActions({ const mediaId = item.mediaId; const clipId = item.id; const store = useMediaLibraryStore.getState(); - const overlayStore = useTimelineItemOverlayStore.getState(); const previousStatus = store.transcriptStatus.get(mediaId) ?? 'idle'; const forceTranscription = options?.forceTranscription ?? false; const replaceExisting = options?.replaceExisting ?? false; - const overlayLabel = forceTranscription ? 'Regenerating captions' : 'Generating captions'; const run = async () => { let updatedTranscriptStatus = previousStatus; @@ -204,28 +215,26 @@ export function useTimelineItemActions({ forceTranscription || !existingTranscript || existingTranscript.model !== model; if (needsTranscription) { - overlayStore.upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: overlayLabel, - progress: 0, - tone: 'info', - }); - store.setTranscriptStatus(mediaId, 'transcribing'); - store.setTranscriptProgress(mediaId, { stage: 'loading', progress: 0 }); + store.setTranscriptStatus(mediaId, 'queued'); + store.setTranscriptProgress(mediaId, { stage: 'queued', progress: 0 }); await mediaTranscriptionService.transcribeMedia(mediaId, { model, + quantization: options?.quantization, + language: options?.language || undefined, + onQueueStatusChange: (state) => { + if (state === 'queued') { + store.setTranscriptStatus(mediaId, 'queued'); + store.setTranscriptProgress(mediaId, { stage: 'queued', progress: 0 }); + return; + } + + store.setTranscriptStatus(mediaId, 'transcribing'); + store.setTranscriptProgress(mediaId, { stage: 'loading', progress: 0 }); + }, onProgress: (progress) => { const mediaLibraryStore = useMediaLibraryStore.getState(); mediaLibraryStore.setTranscriptProgress(mediaId, progress); - const mergedProgress = mediaLibraryStore.transcriptProgress.get(mediaId) ?? progress; - - useTimelineItemOverlayStore.getState().upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: overlayLabel, - progress: getTranscriptionOverallPercent(mergedProgress), - tone: 'info', - }); }, }); @@ -233,16 +242,10 @@ export function useTimelineItemActions({ store.setTranscriptStatus(mediaId, updatedTranscriptStatus); store.clearTranscriptProgress(mediaId); } else { - overlayStore.upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: replaceExisting ? 'Replacing captions' : 'Adding captions', - tone: 'info', - }); updatedTranscriptStatus = 'ready'; store.setTranscriptStatus(mediaId, updatedTranscriptStatus); store.clearTranscriptProgress(mediaId); } - const result = await mediaTranscriptionService.insertTranscriptAsCaptions(mediaId, { clipIds: [clipId], replaceExisting, @@ -251,17 +254,17 @@ export function useTimelineItemActions({ const successMessage = replaceExisting ? result.insertedItemCount > 0 ? result.removedItemCount > 0 - ? `Replaced ${result.removedItemCount} caption clip${result.removedItemCount === 1 ? '' : 's'} with ${result.insertedItemCount} updated clip${result.insertedItemCount === 1 ? '' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Regenerated ${result.insertedItemCount} caption clip${result.insertedItemCount === 1 ? 
'' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Removed ${result.removedItemCount} generated caption clip${result.removedItemCount === 1 ? '' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Inserted ${result.insertedItemCount} caption clip${result.insertedItemCount === 1 ? '' : 's'} for this segment with ${getMediaTranscriptionModelLabel(model)}`; + ? `Updated captions on this segment with ${getMediaTranscriptionModelLabel(model)}` + : `Refreshed captions on this segment with ${getMediaTranscriptionModelLabel(model)}` + : `Removed captions from this segment` + : `Added captions to this segment with ${getMediaTranscriptionModelLabel(model)}`; store.showNotification({ type: 'success', message: successMessage, }); } catch (error) { - if (isLocalInferenceCancellationError(error)) { + if (isTranscriptionCancellationError(error)) { store.setTranscriptStatus(mediaId, previousStatus); store.clearTranscriptProgress(mediaId); return; @@ -269,32 +272,81 @@ export function useTimelineItemActions({ store.setTranscriptStatus(mediaId, updatedTranscriptStatus === 'ready' ? 'ready' : 'error'); store.clearTranscriptProgress(mediaId); + const fallbackMessage = error instanceof Error + ? error.message + : 'Failed to generate captions for segment'; + const friendlyMessage = isTranscriptionOutOfMemoryError(error) + ? TRANSCRIPTION_OOM_HINT + : fallbackMessage; + options?.onError?.(error); store.showNotification({ type: 'error', - message: error instanceof Error ? error.message : 'Failed to generate captions for segment', + message: friendlyMessage, }); - } finally { - useTimelineItemOverlayStore.getState().removeOverlay(clipId, CAPTION_GENERATION_OVERLAY_ID); } }; - void run(); + scheduleAfterPaint(() => { + void run(); + }); }, [item.id, item.mediaId, item.type, isBroken]); - const handleGenerateCaptions = useCallback((model: MediaTranscriptModel) => { - handleCaptionGeneration(model); - }, [handleCaptionGeneration]); - - const handleRegenerateCaptions = useCallback((model: MediaTranscriptModel) => { - handleCaptionGeneration(model, { + const handleCaptionsFromDialog = useCallback((values: { + model: MediaTranscriptModel; + quantization: MediaTranscriptQuantization; + language: string; + }, hasExistingCaptions: boolean, onError?: (error: unknown) => void) => { + handleCaptionGeneration(values.model, { + // The dialog path is always "generate fresh captions". Reusing the + // current transcript is handled explicitly by "Insert Existing Captions". 
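+      // e.g. (hypothetical values) a dialog submit of { model: 'whisper-tiny',
+      // quantization: 'q8', language: 'en' } re-runs transcription even when a
+      // transcript from an earlier run is already cached; existing caption
+      // clips are replaced only when the segment had some (hasExistingCaptions).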
forceTranscription: true, - replaceExisting: true, + replaceExisting: hasExistingCaptions, + quantization: values.quantization, + language: values.language, + onError, }); }, [handleCaptionGeneration]); - const isCaptionGenerationActive = segmentOverlays.some( - (overlay) => overlay.id === CAPTION_GENERATION_OVERLAY_ID, - ); + const handleApplyCaptionsFromTranscript = useCallback(() => { + if ((item.type !== 'video' && item.type !== 'audio') || !item.mediaId || isBroken) { + return; + } + + const mediaId = item.mediaId; + const clipId = item.id; + const replaceExisting = useItemsStore.getState().replaceableCaptionClipIds.has(clipId); + const store = useMediaLibraryStore.getState(); + + const run = async () => { + try { + const existingTranscript = await mediaTranscriptionService.getTranscript(mediaId); + if (!existingTranscript) { + throw new Error('Generate a transcript first, then add captions from it.'); + } + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions(mediaId, { + clipIds: [clipId], + replaceExisting, + }); + + store.showNotification({ + type: 'success', + message: replaceExisting + ? result.insertedItemCount > 0 || result.removedItemCount > 0 + ? 'Updated captions on this segment from the current transcript' + : 'Removed captions from this segment' + : 'Added captions to this segment from the current transcript', + }); + } catch (error) { + store.showNotification({ + type: 'error', + message: error instanceof Error ? error.message : 'Failed to update captions for segment', + }); + } + }; + + void run(); + }, [isBroken, item.id, item.mediaId, item.type]); const isSceneDetectionActive = segmentOverlays.some( (overlay) => overlay.id === SCENE_DETECTION_OVERLAY_ID, @@ -414,6 +466,21 @@ export function useTimelineItemActions({ }, }); + // Persist scene cuts to the workspace so the next session/window + // doesn't need to recompute. Fire-and-forget — UX proceeds regardless. + if (cuts.length > 0) { + void saveScenes({ + mediaId, + service: method === 'histogram' ? 'scene-detect-histogram' : 'scene-detect-optical-flow', + model: verificationModel ?? method, + method, + sampleIntervalMs: method === 'histogram' ? 
250 : 500, + verificationModel, + fps: mediaFps, + cuts, + }).catch((error) => logger.warn('Failed to persist scene cuts', error)); + } + if (cuts.length === 0) { toast.info('No scene cuts detected'); return; @@ -470,7 +537,6 @@ export function useTimelineItemActions({ getCanLinkSelected, getCanUnlinkSelected, hasSpeakableText, - isCaptionGenerationActive, isSceneDetectionActive, isCompositionItem, handleJoinSelected, @@ -485,8 +551,8 @@ export function useTimelineItemActions({ handleBentoLayout, handleFreezeFrame, handleGenerateAudioFromText, - handleGenerateCaptions, - handleRegenerateCaptions, + handleCaptionsFromDialog, + handleApplyCaptionsFromTranscript, handleCreatePreComp, handleEnterComposition, handleDissolveComposition, diff --git a/src/features/timeline/components/timeline-navigator.tsx b/src/features/timeline/components/timeline-navigator.tsx index 5e4dbd5df..f5b139d8e 100644 --- a/src/features/timeline/components/timeline-navigator.tsx +++ b/src/features/timeline/components/timeline-navigator.tsx @@ -2,6 +2,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { useTimelineViewportStore } from '../stores/timeline-viewport-store'; import { useTimelineStore } from '../stores/timeline-store'; +import { useItemsStore } from '../stores/items-store'; import { useZoomStore } from '../stores/zoom-store'; import { cn } from '@/shared/ui/cn'; import { getNavigatorResizeDragResult, getNavigatorThumbMetrics } from './timeline-navigator-utils'; @@ -32,9 +33,7 @@ export function TimelineNavigator({ const [dragStartThumbLeft, setDragStartThumbLeft] = useState(0); const [dragStartThumbWidth, setDragStartThumbWidth] = useState(0); - const maxFrame = useTimelineStore((s) => - s.items.reduce((max, item) => Math.max(max, item.from + item.durationInFrames), 0) - ); + const maxFrame = useItemsStore((s) => s.maxItemEndFrame); const contentDuration = useMemo(() => { const furthestEndSeconds = maxFrame / fps; diff --git a/src/features/timeline/components/timeline-playhead.test.tsx b/src/features/timeline/components/timeline-playhead.test.tsx new file mode 100644 index 000000000..1cd135ec3 --- /dev/null +++ b/src/features/timeline/components/timeline-playhead.test.tsx @@ -0,0 +1,72 @@ +import { fireEvent, render, waitFor } from '@testing-library/react'; +import { beforeEach, describe, expect, it } from 'vitest'; + +import { usePlaybackStore } from '@/shared/state/playback'; +import { TimelinePlayhead } from './timeline-playhead'; +import { useZoomStore, _resetZoomStoreForTest } from '../stores/zoom-store'; +import { useTimelineStore } from '../stores/timeline-store'; + +describe('TimelinePlayhead', () => { + beforeEach(() => { + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 0, + isPlaying: false, + playbackRate: 1, + loop: false, + volume: 1, + muted: false, + zoom: -1, + previewFrame: null, + previewFrameEpoch: 0, + frameUpdateEpoch: 0, + previewItemId: null, + useProxy: true, + previewQuality: 1, + }); + useTimelineStore.setState({ fps: 30 }); + _resetZoomStoreForTest(); + useZoomStore.getState().setZoomLevelSynchronized(1); + }); + + it('uses atomic scrub updates while dragging and clears preview on release', async () => { + const { container } = render( +
+      <div className="timeline-ruler">
+        <TimelinePlayhead inRuler />
+      </div>
, + ); + + const ruler = container.querySelector('.timeline-ruler') as HTMLDivElement | null; + expect(ruler).toBeTruthy(); + + ruler!.getBoundingClientRect = () => ({ + x: 0, + y: 0, + left: 0, + top: 0, + right: 600, + bottom: 40, + width: 600, + height: 40, + toJSON: () => ({}), + }); + + const hitArea = container.querySelector('[style*="width: 20px"]') as HTMLDivElement | null; + expect(hitArea).toBeTruthy(); + + fireEvent.mouseDown(hitArea!, { clientX: 24, clientY: 8, button: 0 }); + fireEvent.mouseMove(document, { clientX: 120, clientY: 8 }); + + await waitFor(() => { + expect(usePlaybackStore.getState().previewFrame).toBe(36); + expect(usePlaybackStore.getState().currentFrame).toBe(36); + }); + + fireEvent.mouseUp(document, { clientX: 120, clientY: 8 }); + + await waitFor(() => { + expect(usePlaybackStore.getState().currentFrame).toBe(36); + expect(usePlaybackStore.getState().previewFrame).toBeNull(); + }); + }); +}); diff --git a/src/features/timeline/components/timeline-playhead.tsx b/src/features/timeline/components/timeline-playhead.tsx index d72fda7d0..a21a444a8 100644 --- a/src/features/timeline/components/timeline-playhead.tsx +++ b/src/features/timeline/components/timeline-playhead.tsx @@ -25,13 +25,13 @@ interface TimelinePlayheadProps { */ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayheadProps) { // Don't subscribe to currentFrame - use ref + manual subscription instead - const setCurrentFrame = usePlaybackStore((s) => s.setCurrentFrame); const setScrubFrame = usePlaybackStore((s) => s.setScrubFrame); const { frameToPixels, pixelsToFrame, pixelsPerSecond } = useTimelineZoomContext(); const [isDragging, setIsDragging] = useState(false); const [isExternalDrag, setIsExternalDrag] = useState(false); const playheadRef = useRef(null); + const isDraggingRef = useRef(false); // Track activeTool via ref subscription to avoid re-renders during playback // This prevents mode toggle from interrupting frame updates @@ -44,7 +44,6 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Use refs to avoid stale closures const pixelsToFrameRef = useRef(pixelsToFrame); - const setCurrentFrameRef = useRef(setCurrentFrame); const setScrubFrameRef = useRef(setScrubFrame); const maxFrameRef = useRef(maxFrame); const frameToPixelsRef = useRef(frameToPixels); @@ -68,14 +67,19 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Update refs when functions change useEffect(() => { pixelsToFrameRef.current = pixelsToFrame; - setCurrentFrameRef.current = setCurrentFrame; setScrubFrameRef.current = setScrubFrame; maxFrameRef.current = maxFrame; frameToPixelsRef.current = frameToPixels; pixelsPerSecondRef.current = pixelsPerSecond; - }, [pixelsToFrame, setCurrentFrame, setScrubFrame, maxFrame, frameToPixels, pixelsPerSecond]); + }, [pixelsToFrame, setScrubFrame, maxFrame, frameToPixels, pixelsPerSecond]); - // Subscribe to currentFrame changes and update position directly (no React re-renders) + useEffect(() => { + isDraggingRef.current = isDragging; + }, [isDragging]); + + // Subscribe to playback frame changes and update position directly. + // During playhead drags, use the same atomic scrub state as the main ruler + // so the fast-scrub overlay hands back to the player consistently. 
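+  // Concretely: the subscription below writes style.left on the playhead
+  // element directly, so scrubbing never triggers a React re-render; it
+  // prefers state.previewFrame while a drag is in flight and falls back to
+  // state.currentFrame once the scrub commits.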
useEffect(() => { const updatePosition = (frame: number) => { if (!playheadRef.current) return; @@ -88,17 +92,24 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Subscribe to store changes return usePlaybackStore.subscribe((state) => { - updatePosition(state.currentFrame); + updatePosition( + isDraggingRef.current && state.previewFrame !== null + ? state.previewFrame + : state.currentFrame + ); }); }, []); // Also update position when frameToPixels changes (zoom changes) useLayoutEffect(() => { if (!playheadRef.current) return; - const frame = usePlaybackStore.getState().currentFrame; + const playbackState = usePlaybackStore.getState(); + const frame = isDraggingRef.current && playbackState.previewFrame !== null + ? playbackState.previewFrame + : playbackState.currentFrame; const leftPosition = Math.round(frameToPixels(frame)); playheadRef.current.style.left = `${leftPosition}px`; - }, [frameToPixels]); + }, [frameToPixels, isDragging]); // Track external drag operations to disable pointer events on hit areas useEffect(() => { @@ -120,7 +131,6 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead const handleMouseDown = useCallback((e: React.MouseEvent) => { e.preventDefault(); e.stopPropagation(); - usePlaybackStore.getState().setPreviewFrame(null); const container = inRuler ? playheadRef.current?.closest('.timeline-ruler') : playheadRef.current?.closest('.timeline-tracks'); @@ -196,7 +206,7 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead } if (pendingFrame !== null) { - setCurrentFrameRef.current(pendingFrame); + setScrubFrameRef.current(pendingFrame); } pendingFrameRef.current = null; diff --git a/src/features/timeline/contracts/media-library.ts b/src/features/timeline/contracts/media-library.ts index 0b8ec0426..5cc6da95e 100644 --- a/src/features/timeline/contracts/media-library.ts +++ b/src/features/timeline/contracts/media-library.ts @@ -7,6 +7,8 @@ export { useTimelineStore } from '../stores/timeline-store'; export { useCompositionNavigationStore } from '../stores/composition-navigation-store'; export { DEFAULT_TRACK_HEIGHT } from '../constants'; export { timelineToSourceFrames, sourceToTimelineFrames } from '../utils/source-calculations'; +export { getNextClassicTrackName, getTrackKind, type TrackKind } from '../utils/classic-tracks'; +export { getEffectiveTrackKindForItem } from '../utils/track-item-compatibility'; export { useCompositionsStore, type SubComposition, diff --git a/src/features/timeline/contracts/preview.ts b/src/features/timeline/contracts/preview.ts index fd5d54cac..05fab42a7 100644 --- a/src/features/timeline/contracts/preview.ts +++ b/src/features/timeline/contracts/preview.ts @@ -35,3 +35,8 @@ export { buildSubCompositionInput, collectSubCompositionMediaIds, } from '../utils/sub-composition-preview'; +export { + createScrubThrottleState, + shouldCommitScrubFrame, + type ScrubThrottleState, +} from '../utils/scrub-throttle'; diff --git a/src/features/timeline/deps/transcribe-dialog.ts b/src/features/timeline/deps/transcribe-dialog.ts new file mode 100644 index 000000000..16f353d07 --- /dev/null +++ b/src/features/timeline/deps/transcribe-dialog.ts @@ -0,0 +1,4 @@ +export { + TranscribeDialog, + type TranscribeDialogValues, +} from './media-library-contract'; diff --git a/src/features/timeline/hooks/use-rate-stretch.ts b/src/features/timeline/hooks/use-rate-stretch.ts index 6825b011d..b1d6e704d 100644 --- a/src/features/timeline/hooks/use-rate-stretch.ts +++ 
b/src/features/timeline/hooks/use-rate-stretch.ts @@ -1,7 +1,7 @@ import { useState, useCallback, useRef, useEffect, useEffectEvent } from 'react'; import type { TimelineItem } from '@/types/timeline'; import { useEditorStore } from '@/app/state/editor'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import type { SnapTarget } from '../types/drag'; import { useTimelineStore } from '../stores/timeline-store'; import { useSelectionStore } from '@/shared/state/selection'; @@ -16,7 +16,11 @@ import { timelineToSourceFrames, } from '../utils/source-calculations'; import { useLinkedEditPreviewStore } from '../stores/linked-edit-preview-store'; -import { getSynchronizedLinkedItems, getLinkedItemIds } from '../utils/linked-items'; +import { + expandItemIdsWithAttachedCaptions, + getSynchronizedLinkedItems, + getLinkedItemIds, +} from '../utils/linked-items'; import { applyRateStretchPreview, applyMovePreview } from '../utils/item-edit-preview'; import type { PreviewItemUpdate } from '../utils/item-edit-preview'; import { useTransitionsStore } from '../stores/transitions-store'; @@ -69,7 +73,7 @@ function computeRipplePreviewUpdates( movedIds.add(itemId); updates.push(applyMovePreview(it, delta)); - for (const linkedId of getLinkedItemIds(items, itemId)) { + for (const linkedId of expandItemIdsWithAttachedCaptions(items, getLinkedItemIds(items, itemId))) { if (linkedId === itemId || movedIds.has(linkedId)) continue; const linked = items.find((i) => i.id === linkedId); if (linked) { @@ -566,7 +570,7 @@ export function useRateStretch(item: TimelineItem, timelineDuration: number, tra e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); setDragState({ isDragging: true, diff --git a/src/features/timeline/hooks/use-timeline-drag.ts b/src/features/timeline/hooks/use-timeline-drag.ts index a8e42f861..92c89d6fb 100644 --- a/src/features/timeline/hooks/use-timeline-drag.ts +++ b/src/features/timeline/hooks/use-timeline-drag.ts @@ -10,6 +10,7 @@ import { useSnapCalculator } from './use-snap-calculator'; import { findNearestAvailableSpace } from '../utils/collision-utils'; import { getTrackKind } from '../utils/classic-tracks'; import { + expandItemIdsWithAttachedCaptions, buildLinkedMovePreviewUpdates, expandSelectionWithLinkedItems, filterUnlockedItemIds, @@ -577,12 +578,13 @@ export function useTimelineDrag( } // Determine which items to drag - const itemsToDrag = isInSelection + const baseItemsToDrag = isInSelection ? (linkedSelectionEnabled ? 
expandSelectionWithLinkedItems(allItems, currentSelectedIds) : currentSelectedIds) : linkedIds; + const itemsToDrag = expandItemIdsWithAttachedCaptions(allItems, baseItemsToDrag); const draggableItemIds = filterUnlockedItemIds(allItems, currentTracks, itemsToDrag); - if (isInSelection && itemsToDrag.length !== currentSelectedIds.length) { - selectItems(itemsToDrag); + if (isInSelection && baseItemsToDrag.length !== currentSelectedIds.length) { + selectItems(baseItemsToDrag); } // Store initial state for all dragged items diff --git a/src/features/timeline/hooks/use-timeline-slip-slide.ts b/src/features/timeline/hooks/use-timeline-slip-slide.ts index 233d13732..0f279b5d7 100644 --- a/src/features/timeline/hooks/use-timeline-slip-slide.ts +++ b/src/features/timeline/hooks/use-timeline-slip-slide.ts @@ -1,6 +1,7 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import type { Transition } from '@/types/transition'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useEditorStore } from '@/app/state/editor'; import { DRAG_THRESHOLD_PIXELS } from '../constants'; import { useTimelineStore } from '../stores/timeline-store'; @@ -8,6 +9,7 @@ import { useTransitionsStore } from '../stores/transitions-store'; import { useSelectionStore } from '@/shared/state/selection'; import { pixelsToTimeNow } from '../utils/zoom-conversions'; import { useSnapCalculator } from './use-snap-calculator'; +import type { SnapTarget } from '../types/drag'; import { useSlipEditPreviewStore } from '../stores/slip-edit-preview-store'; import { useSlideEditPreviewStore } from '../stores/slide-edit-preview-store'; import { useLinkedEditPreviewStore } from '../stores/linked-edit-preview-store'; @@ -24,7 +26,7 @@ import { getMatchingSynchronizedLinkedCounterpart, getSynchronizedLinkedItems, } from '../utils/linked-items'; -import { clampSlipDeltaToPreserveTransitions, clampSlideDeltaToPreserveTransitions } from '../utils/transition-utils'; +import { canAddTransition, clampSlipDeltaToPreserveTransitions, clampSlideDeltaToPreserveTransitions } from '../utils/transition-utils'; import { applyMovePreview, applySlipPreview, @@ -33,6 +35,7 @@ import { type PreviewItemUpdate, } from '../utils/item-edit-preview'; import { hasExceededDragThreshold } from '../utils/drag-threshold'; +import { computeSlideContinuitySourceDelta } from '../utils/slide-utils'; interface SlipSlideState { isActive: boolean; @@ -50,6 +53,153 @@ interface SlipSlideStartOptions { activateOnMoveThreshold?: boolean; } +interface SlideParticipantConstraintContext { + participant: TimelineItem; + leftAdjacent: TimelineItem | null; + rightAdjacent: TimelineItem | null; + nearestNeighbors: ReturnType; + excludeIds: Set; + leftAdjacentNearestStart: number | null; + rightAdjacentNearestEnd: number | null; +} + +interface SlideGestureContext { + currentItem: TimelineItem; + allItems: TimelineItem[]; + itemsById: Map; + transitions: Transition[]; + leftNeighbor: TimelineItem | null; + rightNeighbor: TimelineItem | null; + snapTargets: SnapTarget[]; + snapExcludeIds: Set; + linkedSelectionEnabled: boolean; + synchronizedCounterpart: TimelineItem | null; + leftCounterpart: TimelineItem | null; + rightCounterpart: TimelineItem | null; + slideItemIds: Set; + primaryNearestNeighbors: ReturnType; + leftNeighborNearestStart: number | null; + rightNeighborNearestEnd: number | null; + participantContexts: 
SlideParticipantConstraintContext[]; + relatedTransitions: Transition[]; +} + +function findAdjacentTrackNeighbors( + item: TimelineItem, + items: TimelineItem[], +): { leftAdjacent: TimelineItem | null; rightAdjacent: TimelineItem | null } { + const itemEnd = item.from + item.durationInFrames; + let leftAdjacent: TimelineItem | null = null; + let rightAdjacent: TimelineItem | null = null; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId) continue; + const otherEnd = other.from + other.durationInFrames; + + if (otherEnd === item.from && (!leftAdjacent || other.from > leftAdjacent.from)) { + leftAdjacent = other; + } + if (other.from === itemEnd && (!rightAdjacent || other.from < rightAdjacent.from)) { + rightAdjacent = other; + } + } + + return { leftAdjacent, rightAdjacent }; +} + +function findNearestStartAtOrAfter( + item: TimelineItem, + items: TimelineItem[], + excludeIds: ReadonlySet, +): number | null { + const itemEnd = item.from + item.durationInFrames; + let nearestStart = Infinity; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId || excludeIds.has(other.id)) continue; + if (other.from >= itemEnd) { + nearestStart = Math.min(nearestStart, other.from); + } + } + + return Number.isFinite(nearestStart) ? nearestStart : null; +} + +function findNearestEndAtOrBefore( + item: TimelineItem, + items: TimelineItem[], + excludeIds: ReadonlySet, +): number | null { + let nearestEnd = -Infinity; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId || excludeIds.has(other.id)) continue; + const otherEnd = other.from + other.durationInFrames; + if (otherEnd <= item.from) { + nearestEnd = Math.max(nearestEnd, otherEnd); + } + } + + return Number.isFinite(nearestEnd) ? nearestEnd : null; +} + +function clampEndAgainstNearestStart( + item: TimelineItem, + trimAmount: number, + nearestStart: number | null, +): number { + if (trimAmount <= 0 || nearestStart === null) return trimAmount; + const itemEnd = item.from + item.durationInFrames; + const maxExtend = nearestStart - itemEnd; + return trimAmount > maxExtend ? maxExtend : trimAmount; +} + +function clampStartAgainstNearestEnd( + item: TimelineItem, + trimAmount: number, + nearestEnd: number | null, +): number { + if (trimAmount >= 0 || nearestEnd === null) return trimAmount; + const maxExtend = item.from - nearestEnd; + if (-trimAmount > maxExtend) { + return maxExtend > 0 ? -maxExtend : 0; + } + return trimAmount; +} + +function applyPreviewUpdate( + item: TimelineItem, + previewUpdate: PreviewItemUpdate | null | undefined, +): TimelineItem { + return previewUpdate + ? ({ ...item, ...previewUpdate } as TimelineItem) + : item; +} + +function clampDeltaToLastValidValue( + requestedDelta: number, + isValid: (delta: number) => boolean, +): number { + if (!isValid(0)) return 0; + if (isValid(requestedDelta)) return requestedDelta; + + const sign = requestedDelta < 0 ? -1 : 1; + let low = 0; + let high = Math.abs(requestedDelta); + + while (low < high) { + const mid = Math.ceil((low + high) / 2); + const candidate = sign * mid; + if (isValid(candidate)) { + low = mid; + } else { + high = mid - 1; + } + } + + return sign * low; +} + /** * Hook for handling slip and slide editing on timeline items. 
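 *
 * Slip shifts the clip's source in/out points without moving the clip on the
 * timeline; slide moves the clip while trimming the left neighbor's end and
 * the right neighbor's start, so surrounding clips keep their positions.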
* @@ -88,6 +238,7 @@ export function useTimelineSlipSlide( stateRef.current = state; const latestDeltaRef = useRef(0); const pendingStartCleanupRef = useRef<(() => void) | null>(null); + const slideGestureContextRef = useRef(null); const getItemFromStore = useCallback(() => { return useTimelineStore.getState().items.find((i) => i.id === item.id) ?? item; @@ -104,8 +255,98 @@ export function useTimelineSlipSlide( return findEditNeighborsWithTransitions(currentItem, allItems, transitions); }, [getItemFromStore]); + const buildSlideGestureContext = useCallback(( + currentItem: TimelineItem, + leftNeighbor: TimelineItem | null, + rightNeighbor: TimelineItem | null, + ): SlideGestureContext => { + const allItems = useTimelineStore.getState().items; + const transitions = useTransitionsStore.getState().transitions; + const itemsById = new Map(allItems.map((candidate) => [candidate.id, candidate])); + const linkedSelectionEnabled = useEditorStore.getState().linkedSelectionEnabled; + const synchronizedItems = linkedSelectionEnabled + ? getSynchronizedLinkedItems(allItems, currentItem.id) + : [currentItem]; + const synchronizedCounterpart = synchronizedItems.find((candidate) => candidate.id !== currentItem.id) ?? null; + const leftCounterpart = leftNeighbor && synchronizedCounterpart + ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighbor.id, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const rightCounterpart = rightNeighbor && synchronizedCounterpart + ? getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighbor.id, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const slideItemIds = new Set([ + currentItem.id, + leftNeighbor?.id ?? '', + rightNeighbor?.id ?? '', + ].filter(Boolean)); + const snapExcludeIds = new Set(slideItemIds); + const snapTargets = snapEnabled ? getMagneticSnapTargets() : []; + const primaryNearestNeighbors = findNearestNeighbors(currentItem, allItems); + const leftNeighborNearestStart = leftNeighbor + ? findNearestStartAtOrAfter(leftNeighbor, allItems, slideItemIds) + : null; + const rightNeighborNearestEnd = rightNeighbor + ? findNearestEndAtOrBefore(rightNeighbor, allItems, slideItemIds) + : null; + + const participantContexts: SlideParticipantConstraintContext[] = synchronizedItems + .filter((candidate) => candidate.id !== currentItem.id) + .map((participant) => { + const excludeIds = new Set(slideItemIds); + for (const synchronizedItem of synchronizedItems) { + excludeIds.add(synchronizedItem.id); + } + + const { leftAdjacent, rightAdjacent } = findAdjacentTrackNeighbors(participant, allItems); + if (leftAdjacent) excludeIds.add(leftAdjacent.id); + if (rightAdjacent) excludeIds.add(rightAdjacent.id); + + return { + participant, + leftAdjacent, + rightAdjacent, + nearestNeighbors: findNearestNeighbors(participant, allItems), + excludeIds, + leftAdjacentNearestStart: leftAdjacent + ? findNearestStartAtOrAfter(leftAdjacent, allItems, excludeIds) + : null, + rightAdjacentNearestEnd: rightAdjacent + ? 
findNearestEndAtOrBefore(rightAdjacent, allItems, excludeIds) + : null, + }; + }); + + const affectedIds = new Set([currentItem.id]); + if (leftNeighbor) affectedIds.add(leftNeighbor.id); + if (rightNeighbor) affectedIds.add(rightNeighbor.id); + const relatedTransitions = transitions.filter((transition) => ( + affectedIds.has(transition.leftClipId) || affectedIds.has(transition.rightClipId) + )); + + return { + currentItem, + allItems, + itemsById, + transitions, + leftNeighbor, + rightNeighbor, + snapTargets, + snapExcludeIds, + linkedSelectionEnabled, + synchronizedCounterpart, + leftCounterpart, + rightCounterpart, + slideItemIds, + primaryNearestNeighbors, + leftNeighborNearestStart, + rightNeighborNearestEnd, + participantContexts, + relatedTransitions, + }; + }, [getMagneticSnapTargets, snapEnabled]); + const beginSlipSlideGesture = useCallback((startX: number, mode: 'slip' | 'slide') => { - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const { leftNeighbor, rightNeighbor } = findNeighbors(); const currentItem = getItemFromStore(); @@ -137,6 +378,7 @@ export function useTimelineSlipSlide( trackId: currentItem.trackId, slipDelta: 0, }); + slideGestureContextRef.current = null; } else { // Compute the effective slide range (tightest across all tracks), // incorporating transition constraints so the initial limit box matches @@ -162,6 +404,7 @@ export function useTimelineSlipSlide( minDelta: slideMinDelta, maxDelta: slideMaxDelta, }); + slideGestureContextRef.current = buildSlideGestureContext(currentItem, leftNeighbor ?? null, rightNeighbor ?? null); } // Seed linked companion previews with zero-delta so their overlays appear immediately @@ -179,7 +422,7 @@ export function useTimelineSlipSlide( } // Note: clampSlideDelta intentionally omitted — it reads fps from store at // call time, and including it would cause a TDZ error (defined after this hook). - }, [findNeighbors, getItemFromStore, item.id, setDragState]); + }, [buildSlideGestureContext, findNeighbors, getItemFromStore, item.id, setDragState]); /** * Clamp slip delta to source boundaries. 
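 * A slip never changes `from` or `durationInFrames`; only the clip's source
 * window moves, so the clamp runs purely against the available source media.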
@@ -303,6 +546,152 @@ export function useTimelineSlipSlide( return clamped; }, [getItemFromStore, fps, item.id]); + const clampSlideDeltaWithContext = useCallback((delta: number, context: SlideGestureContext): number => { + let clamped = delta; + const { currentItem } = context; + + if (currentItem.from + clamped < 0) { + clamped = -currentItem.from; + } + + if (context.leftNeighbor) { + const { clampedAmount } = clampTrimAmount(context.leftNeighbor, 'end', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampEndAgainstNearestStart( + context.leftNeighbor, + clamped, + context.leftNeighborNearestStart, + ); + } + + if (context.rightNeighbor) { + const { clampedAmount } = clampTrimAmount(context.rightNeighbor, 'start', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampStartAgainstNearestEnd( + context.rightNeighbor, + clamped, + context.rightNeighborNearestEnd, + ); + } + + for (const participantContext of context.participantContexts) { + if (participantContext.leftAdjacent) { + const { clampedAmount } = clampTrimAmount(participantContext.leftAdjacent, 'end', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampEndAgainstNearestStart( + participantContext.leftAdjacent, + clamped, + participantContext.leftAdjacentNearestStart, + ); + } + + if (participantContext.rightAdjacent) { + const { clampedAmount } = clampTrimAmount(participantContext.rightAdjacent, 'start', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampStartAgainstNearestEnd( + participantContext.rightAdjacent, + clamped, + participantContext.rightAdjacentNearestEnd, + ); + } + + const leftWall = participantContext.nearestNeighbors.leftNeighbor; + if (leftWall && !participantContext.excludeIds.has(leftWall.id)) { + const wallRight = leftWall.from + leftWall.durationInFrames; + const maxLeft = -(participantContext.participant.from - wallRight); + if (clamped < maxLeft) clamped = maxLeft; + } + + const rightWall = participantContext.nearestNeighbors.rightNeighbor; + if (rightWall && !participantContext.excludeIds.has(rightWall.id)) { + const participantEnd = participantContext.participant.from + participantContext.participant.durationInFrames; + const maxRight = rightWall.from - participantEnd; + if (clamped > maxRight) clamped = maxRight; + } + } + + const primaryLeftWall = context.primaryNearestNeighbors.leftNeighbor; + if (primaryLeftWall && !context.slideItemIds.has(primaryLeftWall.id)) { + const wallRight = primaryLeftWall.from + primaryLeftWall.durationInFrames; + const maxLeft = -(currentItem.from - wallRight); + if (clamped < maxLeft) clamped = maxLeft; + } + + const primaryRightWall = context.primaryNearestNeighbors.rightNeighbor; + if (primaryRightWall && !context.slideItemIds.has(primaryRightWall.id)) { + const primaryEnd = currentItem.from + currentItem.durationInFrames; + const maxRight = primaryRightWall.from - primaryEnd; + if (clamped > maxRight) clamped = maxRight; + } + + return clamped; + }, [fps]); + + const clampSlideDeltaToPreserveTransitionsWithContext = useCallback(( + requestedDelta: number, + context: SlideGestureContext, + ): number => { + if (requestedDelta === 0 || context.relatedTransitions.length === 0) { + return requestedDelta; + } + + const isValid = (delta: number): boolean => { + const previewById = new Map(); + + if (context.leftNeighbor) { + 
previewById.set( + context.leftNeighbor.id, + applyPreviewUpdate(context.leftNeighbor, applyTrimEndPreview(context.leftNeighbor, delta, fps)), + ); + } + + if (context.rightNeighbor) { + previewById.set( + context.rightNeighbor.id, + applyPreviewUpdate(context.rightNeighbor, applyTrimStartPreview(context.rightNeighbor, delta, fps)), + ); + } + + let slidItemPreview = applyPreviewUpdate(context.currentItem, applyMovePreview(context.currentItem, delta)); + const continuitySourceDelta = computeSlideContinuitySourceDelta( + context.currentItem, + context.leftNeighbor, + context.rightNeighbor, + delta, + fps, + ); + if ( + continuitySourceDelta !== 0 + && (slidItemPreview.type === 'video' || slidItemPreview.type === 'audio' || slidItemPreview.type === 'composition') + && slidItemPreview.sourceEnd !== undefined + ) { + slidItemPreview = { + ...slidItemPreview, + sourceStart: (slidItemPreview.sourceStart ?? 0) + continuitySourceDelta, + sourceEnd: slidItemPreview.sourceEnd + continuitySourceDelta, + }; + } + previewById.set(context.currentItem.id, slidItemPreview); + + return context.relatedTransitions.every((transition) => { + const leftClip = previewById.get(transition.leftClipId) ?? context.itemsById.get(transition.leftClipId) ?? null; + const rightClip = previewById.get(transition.rightClipId) ?? context.itemsById.get(transition.rightClipId) ?? null; + if (!leftClip || !rightClip) return true; + return canAddTransition(leftClip, rightClip, transition.durationInFrames, transition.alignment).canAdd; + }); + }; + + return clampDeltaToLastValidValue(requestedDelta, isValid); + }, [fps]); + // Mouse move handler const handleMouseMove = useCallback( (e: MouseEvent) => { @@ -385,15 +774,18 @@ export function useTimelineSlipSlide( useLinkedEditPreviewStore.getState().setUpdates(linkedPreviewUpdates); } else if (mode === 'slide') { + const slideContext = slideGestureContextRef.current; const { leftNeighborId, rightNeighborId } = stateRef.current; - const storeItem = getItemFromStore(); + const storeItem = slideContext?.currentItem ?? getItemFromStore(); // Apply snapping for slide (clip edges snap to items/playhead/grid) if (snapEnabled) { - const targets = getMagneticSnapTargets(); - const excludeIds = new Set([item.id]); - if (leftNeighborId) excludeIds.add(leftNeighborId); - if (rightNeighborId) excludeIds.add(rightNeighborId); + const targets = slideContext?.snapTargets ?? getMagneticSnapTargets(); + const excludeIds = slideContext?.snapExcludeIds ?? new Set([ + item.id, + leftNeighborId ?? '', + rightNeighborId ?? '', + ].filter(Boolean)); const newStart = storeItem.from + deltaFrames; const newEnd = newStart + storeItem.durationInFrames; @@ -425,17 +817,21 @@ export function useTimelineSlipSlide( } } - const allItems = useTimelineStore.getState().items; - const sourceClamped = clampSlideDelta(deltaFrames, leftNeighborId, rightNeighborId); - const transitionClamped = clampSlideDeltaToPreserveTransitions( - storeItem, - sourceClamped, - leftNeighborId ? (allItems.find((candidate) => candidate.id === leftNeighborId) ?? null) : null, - rightNeighborId ? (allItems.find((candidate) => candidate.id === rightNeighborId) ?? null) : null, - allItems, - useTransitionsStore.getState().transitions, - fps, - ); + const allItems = slideContext?.allItems ?? useTimelineStore.getState().items; + const sourceClamped = slideContext + ? clampSlideDeltaWithContext(deltaFrames, slideContext) + : clampSlideDelta(deltaFrames, leftNeighborId, rightNeighborId); + const transitionClamped = slideContext + ? 
clampSlideDeltaToPreserveTransitionsWithContext(sourceClamped, slideContext) + : clampSlideDeltaToPreserveTransitions( + storeItem, + sourceClamped, + leftNeighborId ? (allItems.find((candidate) => candidate.id === leftNeighborId) ?? null) : null, + rightNeighborId ? (allItems.find((candidate) => candidate.id === rightNeighborId) ?? null) : null, + allItems, + useTransitionsStore.getState().transitions, + fps, + ); const clamped = transitionClamped; const isConstrained = clamped !== deltaFrames; const constraintEdge = !isConstrained @@ -485,21 +881,27 @@ export function useTimelineSlipSlide( } const linkedSelectionEnabled = useEditorStore.getState().linkedSelectionEnabled; - const synchronizedCounterpart = linkedSelectionEnabled - ? getSynchronizedLinkedItems(allItems, storeItem.id) - .find((candidate) => candidate.id !== storeItem.id) ?? null - : null; + const synchronizedCounterpart = slideContext + ? slideContext.synchronizedCounterpart + : linkedSelectionEnabled + ? getSynchronizedLinkedItems(allItems, storeItem.id) + .find((candidate) => candidate.id !== storeItem.id) ?? null + : null; const linkedPreviewUpdates: PreviewItemUpdate[] = []; if (synchronizedCounterpart) { linkedPreviewUpdates.push(applyMovePreview(synchronizedCounterpart, clamped)); - const leftCounterpart = leftNeighborId - ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) - : null; - const rightCounterpart = rightNeighborId - ? getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) - : null; + const leftCounterpart = slideContext + ? slideContext.leftCounterpart + : leftNeighborId + ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const rightCounterpart = slideContext + ? slideContext.rightCounterpart + : rightNeighborId + ? 
getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; if (leftCounterpart) { linkedPreviewUpdates.push(applyTrimEndPreview(leftCounterpart, clamped, fps)); @@ -513,7 +915,20 @@ export function useTimelineSlipSlide( } }, - [pixelsToTime, fps, trackLocked, item.id, getItemFromStore, clampSlipDelta, clampSlideDelta, snapEnabled, getMagneticSnapTargets, getSnapThresholdFrames], + [ + pixelsToTime, + fps, + trackLocked, + item.id, + getItemFromStore, + clampSlipDelta, + clampSlideDelta, + clampSlideDeltaToPreserveTransitionsWithContext, + clampSlideDeltaWithContext, + snapEnabled, + getMagneticSnapTargets, + getSnapThresholdFrames, + ], ); // Mouse up handler — commits changes @@ -552,6 +967,7 @@ export function useTimelineSlipSlide( constraintLabel: null, }); latestDeltaRef.current = 0; + slideGestureContextRef.current = null; } }, [item.id, setDragState]); @@ -571,6 +987,7 @@ export function useTimelineSlipSlide( useLinkedEditPreviewStore.getState().clear(); setDragState(null); latestDeltaRef.current = 0; + slideGestureContextRef.current = null; } }; } @@ -578,6 +995,7 @@ export function useTimelineSlipSlide( useEffect(() => () => { pendingStartCleanupRef.current?.(); + slideGestureContextRef.current = null; }, []); // Start slip/slide drag diff --git a/src/features/timeline/hooks/use-timeline-trim.ts b/src/features/timeline/hooks/use-timeline-trim.ts index a1700b824..7c98a728d 100644 --- a/src/features/timeline/hooks/use-timeline-trim.ts +++ b/src/features/timeline/hooks/use-timeline-trim.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useEditorStore } from '@/app/state/editor'; import { toast } from 'sonner'; import type { SnapTarget } from '../types/drag'; @@ -719,7 +719,7 @@ export function useTimelineTrim(item: TimelineItem, timelineDuration: number, tr // including guardrail early returns. e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const forcedMode = options?.forcedMode ?? null; const destroyTransitionAtHandle = options?.destroyTransitionAtHandle ?? 
false; diff --git a/src/features/timeline/hooks/use-track-push.ts b/src/features/timeline/hooks/use-track-push.ts index 5ef1e5a2b..95d895cd9 100644 --- a/src/features/timeline/hooks/use-track-push.ts +++ b/src/features/timeline/hooks/use-track-push.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useSelectionStore } from '@/shared/state/selection'; import { useTimelineStore } from '../stores/timeline-store'; import { useItemsStore } from '../stores/items-store'; @@ -135,7 +135,7 @@ export function useTrackPush(item: TimelineItem, timelineDuration: number, track if (e.button !== 0 || trackLocked) return; e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const { items: allItems, itemsByTrackId } = useItemsStore.getState(); const cutFrame = item.from; diff --git a/src/features/timeline/hooks/use-transition-resize.ts b/src/features/timeline/hooks/use-transition-resize.ts index c1cc96f0b..57173b276 100644 --- a/src/features/timeline/hooks/use-transition-resize.ts +++ b/src/features/timeline/hooks/use-transition-resize.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect, useMemo } from 'react'; import type { Transition } from '@/types/transition'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { TRANSITION_CONFIGS } from '@/types/transition'; import { useTimelineStore } from '../stores/timeline-store'; import { useItemsStore } from '../stores/items-store'; @@ -132,7 +132,7 @@ export function useTransitionResize(transition: Transition) { (e: React.MouseEvent, handle: ResizeHandle) => { e.preventDefault(); e.stopPropagation(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); setResizeState({ isResizing: true, diff --git a/src/features/timeline/services/filmstrip-cache.test.ts b/src/features/timeline/services/filmstrip-cache.test.ts index ef383b5fc..9b5a2fca1 100644 --- a/src/features/timeline/services/filmstrip-cache.test.ts +++ b/src/features/timeline/services/filmstrip-cache.test.ts @@ -22,10 +22,15 @@ vi.mock('@/shared/logging/logger', () => ({ createLogger: vi.fn(() => loggerMocks), })); -vi.mock('./filmstrip-opfs-storage', () => ({ - filmstripOPFSStorage: { +vi.mock('./filmstrip-storage', () => ({ + filmstripStorage: { load: vi.fn(), saveMetadata: vi.fn(), + saveFrameBlob: vi.fn(), + loadSingleFrame: vi.fn(), + getExistingIndices: vi.fn(), + createFrameFromBitmap: vi.fn(), + createFrameFromBlob: vi.fn(), revokeUrls: vi.fn(), delete: vi.fn(), clearAll: vi.fn(), diff --git a/src/features/timeline/services/filmstrip-cache.ts b/src/features/timeline/services/filmstrip-cache.ts index d17f065d9..f9e968d88 100644 --- a/src/features/timeline/services/filmstrip-cache.ts +++ b/src/features/timeline/services/filmstrip-cache.ts @@ -3,7 +3,7 @@ * * Simple service that: * 1. Manages extraction worker - * 2. Provides object URLs from OPFS storage + * 2. Provides object URLs from persisted filmstrip storage * 3. Notifies subscribers when new frames are available * * No ImageBitmaps in memory - just URLs for tags. 
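 *
 * The filmstripStorage facade keeps the old OPFS service's surface: load,
 * saveMetadata, saveFrameBlob, loadSingleFrame, getExistingIndices,
 * createFrameFromBitmap, createFrameFromBlob, revokeUrls, delete, and
 * clearAll, as exercised by the updated test mock.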
@@ -25,7 +25,7 @@ import { FILMSTRIP_EXTRACT_HEIGHT, THUMBNAIL_WIDTH, } from '@/features/timeline/constants'; -import { filmstripOPFSStorage, type FilmstripFrame } from './filmstrip-opfs-storage'; +import { filmstripStorage, type FilmstripFrame } from './filmstrip-storage'; import { FilmstripMemoryState } from './filmstrip-memory-state'; import type { ExtractRequest, WorkerResponse } from '../workers/filmstrip-extraction-worker'; @@ -319,7 +319,7 @@ class FilmstripCacheService { this.cache.delete(mediaId); this.clearCacheMeta(mediaId); - filmstripOPFSStorage.revokeUrls(mediaId); + filmstripStorage.revokeUrls(mediaId); this.clearIdleEvictionTimer(mediaId); logger.debug(`Evicted in-memory filmstrip ${mediaId} (${reason})`); return true; @@ -798,7 +798,7 @@ class FilmstripCacheService { frames: cached.frames, existingIndices: cached.frames.map((frame) => frame.index), } - : await filmstripOPFSStorage.load(mediaId); + : await filmstripStorage.load(mediaId); const existingFrames = stored?.frames ?? []; const existingIndices = stored?.existingIndices ?? []; @@ -1082,7 +1082,7 @@ class FilmstripCacheService { options?: FilmstripLoadOptions, ): Promise { // Try loading from storage - const stored = await filmstripOPFSStorage.load(mediaId); + const stored = await filmstripStorage.load(mediaId); if (stored?.metadata.isComplete) { // Complete - return immediately @@ -1231,7 +1231,7 @@ class FilmstripCacheService { const targetFrames = [...existingFrames].sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(pending, targetFrames); if (settled.isComplete && this.shouldPersistCompletionMetadata(pending)) { - void filmstripOPFSStorage.saveMetadata(mediaId, { + void filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: true, @@ -1251,7 +1251,7 @@ class FilmstripCacheService { // Persist extraction session metadata once. Workers should focus on frame // writes; centralizing meta writes avoids cross-worker file contention. - void filmstripOPFSStorage.saveMetadata(mediaId, { + void filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: false, @@ -1504,9 +1504,9 @@ class FilmstripCacheService { } // When blobs arrive (after JPEG encode), upgrade frames with proper URLs - // and persist to OPFS. This replaces bitmap-only frames. + // and persist them to the workspace. This replaces bitmap-only frames. 
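+    // Awaiting ingestSavedFrames matters now: it also writes each blob via
+    // filmstripStorage.saveFrameBlob, so the frames are on disk before the
+    // loadNewFrames pass below re-reads them from storage.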
if (Array.isArray(response.savedFrames) && response.savedFrames.length > 0) { - this.ingestSavedFrames( + await this.ingestSavedFrames( mediaId, response.savedFrames.filter((frame) => frame.index >= workerState.startIndex @@ -1525,14 +1525,14 @@ class FilmstripCacheService { try { await this.loadNewFrames(mediaId, newIndices); } catch (error) { - logger.error('Failed to load saved filmstrip frames from OPFS', { + logger.error('Failed to load saved filmstrip frames from persisted storage', { mediaId, requestId: workerState.requestId, range: [workerState.startIndex, workerState.endIndex], newIndicesCount: newIndices.length, error, }); - this.handleWorkerError(mediaId, 'Failed to load saved frames from OPFS'); + this.handleWorkerError(mediaId, 'Failed to load saved frames from storage'); return; } } @@ -1544,14 +1544,14 @@ class FilmstripCacheService { try { await this.flushWorkerRangeLoads(mediaId, workerState); } catch (error) { - logger.error('Failed to flush worker frame range loads from OPFS', { + logger.error('Failed to flush worker frame range loads from persisted storage', { mediaId, requestId: workerState.requestId, range: [workerState.startIndex, workerState.endIndex], newFrameCount, error, }); - this.handleWorkerError(mediaId, 'Failed to refresh worker frame range from OPFS'); + this.handleWorkerError(mediaId, 'Failed to refresh worker frame range from storage'); return; } } @@ -1580,12 +1580,12 @@ class FilmstripCacheService { // Check if all workers are done if (pending.completedWorkers === pending.workers.length) { // All workers done - finalize directly from in-memory extracted frames - // to avoid an extra full OPFS directory scan and URL recreation pass. + // to avoid an extra full storage scan and URL recreation pass. const finalFrames = Array.from(pending.extractedFrames.values()) .sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(pending, finalFrames); try { - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: settled.isComplete && this.shouldPersistCompletionMetadata(pending), @@ -1678,7 +1678,7 @@ class FilmstripCacheService { if (!pending) return 0; // Discover what is actually saved on disk for this worker's range. 
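    // getExistingIndices is range-scoped: it reports only the frame indices
    // within [startIndex, endIndex] that already have persisted files.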
- const inRangeExistingIndices = await filmstripOPFSStorage.getExistingIndices( + const inRangeExistingIndices = await filmstripStorage.getExistingIndices( mediaId, startIndex, endIndex @@ -1703,7 +1703,7 @@ class FilmstripCacheService { if (indices.length === 0) return; const loadPromises = indices.map(async (index) => { - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, index); + const frame = await filmstripStorage.loadSingleFrame(mediaId, index); if (frame) { pending.extractedFrames.set(index, frame); this.noteFirstFrame(pending.metrics); @@ -1725,7 +1725,7 @@ class FilmstripCacheService { bf.bitmap.close(); continue; } - const frame = filmstripOPFSStorage.createFrameFromBitmap(mediaId, bf.index, bf.bitmap); + const frame = filmstripStorage.createFrameFromBitmap(mediaId, bf.index, bf.bitmap); if (frame) { pending.extractedFrames.set(bf.index, frame); this.noteFirstFrame(pending.metrics); @@ -1733,16 +1733,18 @@ class FilmstripCacheService { } } - private ingestSavedFrames( + private async ingestSavedFrames( mediaId: string, savedFrames: Array<{ index: number; blob: Blob }> - ): void { + ): Promise { const pending = this.pendingExtractions.get(mediaId); if (!pending || savedFrames.length === 0) return; + const persistWrites: Promise[] = []; + for (const saved of savedFrames) { const existing = pending.extractedFrames.get(saved.index); - const frame = filmstripOPFSStorage.createFrameFromBlob(mediaId, saved.index, saved.blob); + const frame = filmstripStorage.createFrameFromBlob(mediaId, saved.index, saved.blob); if (frame) { // Close bitmap if this frame was previously bitmap-only if (existing?.bitmap) { @@ -1753,6 +1755,13 @@ class FilmstripCacheService { this.noteFirstFrame(pending.metrics); } } + persistWrites.push( + filmstripStorage.saveFrameBlob(mediaId, saved.index, saved.blob), + ); + } + + if (persistWrites.length > 0) { + await Promise.all(persistWrites); } } @@ -1919,7 +1928,7 @@ class FilmstripCacheService { const totalTargetFrames = Math.max(1, targetIndices.length); let extractedTargetCount = skipSet.size; - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: false, @@ -1951,9 +1960,9 @@ class FilmstripCacheService { this.drawCoverFrame(video, ctx, canvas.width, canvas.height); const blob = await this.canvasToBlob(canvas); - await filmstripOPFSStorage.saveFrameBlob(mediaId, i, blob); + await filmstripStorage.saveFrameBlob(mediaId, i, blob); - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, i); + const frame = await filmstripStorage.loadSingleFrame(mediaId, i); if (frame) { currentPending.extractedFrames.set(i, frame); this.noteFirstFrame(currentPending.metrics); @@ -1987,7 +1996,7 @@ class FilmstripCacheService { const finalFrames = Array.from(finishedPending.extractedFrames.values()) .sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(finishedPending, finalFrames); - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: settled.isComplete && this.shouldPersistCompletionMetadata(finishedPending), @@ -2285,7 +2294,7 @@ class FilmstripCacheService { } /** - * Refresh cached frame URLs from OPFS when a visible tile reports a stale source. + * Refresh cached frame URLs from persisted storage when a visible tile reports a stale source. 
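+   * (Frame object URLs can go stale once the backing blob is replaced; this
+   * re-reads the affected frames and mints fresh URLs for them.)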
*/ async refreshFrames(mediaId: string, frameIndices: number[]): Promise { const normalizedIndices = Array.from(new Set( @@ -2296,7 +2305,7 @@ class FilmstripCacheService { } const refreshedEntries = await Promise.all(normalizedIndices.map(async (index) => { - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, index); + const frame = await filmstripStorage.loadSingleFrame(mediaId, index); return frame ? [index, frame] as const : null; })); const refreshedByIndex = new Map( @@ -2369,8 +2378,8 @@ class FilmstripCacheService { this.clearIdleEvictionTimer(mediaId); this.cache.delete(mediaId); this.clearCacheMeta(mediaId); - filmstripOPFSStorage.revokeUrls(mediaId); - await filmstripOPFSStorage.delete(mediaId); + filmstripStorage.revokeUrls(mediaId); + await filmstripStorage.delete(mediaId); } /** @@ -2382,7 +2391,7 @@ class FilmstripCacheService { } this.cache.clear(); this.memoryState.clear(); - await filmstripOPFSStorage.clearAll(); + await filmstripStorage.clearAll(); } /** @@ -2390,7 +2399,7 @@ class FilmstripCacheService { * * IMPORTANT: * - This is runtime cleanup only (workers, timers, in-memory URLs/cache). - * - Do NOT clear OPFS filmstrip files here. + * - Do NOT clear persisted filmstrip files here. * Persistent filmstrip data must survive page refresh so F5 can reuse cache. * - Use clearAll()/clearMedia() only for explicit user/debug cache reset flows. */ @@ -2399,9 +2408,9 @@ class FilmstripCacheService { this.abort(mediaId); } this.workerPoolManager.terminateAll(); - // Revoke in-memory object URLs only; keep persisted OPFS filmstrip files. + // Revoke in-memory object URLs only; keep persisted filmstrip files. for (const mediaId of this.cache.keys()) { - filmstripOPFSStorage.revokeUrls(mediaId); + filmstripStorage.revokeUrls(mediaId); } this.cache.clear(); this.memoryState.clear(); diff --git a/src/features/timeline/services/filmstrip-memory-state.ts b/src/features/timeline/services/filmstrip-memory-state.ts index 1e395ea1a..7be6ed086 100644 --- a/src/features/timeline/services/filmstrip-memory-state.ts +++ b/src/features/timeline/services/filmstrip-memory-state.ts @@ -2,7 +2,7 @@ import { FILMSTRIP_EXTRACT_HEIGHT, FILMSTRIP_EXTRACT_WIDTH, } from '@/features/timeline/constants'; -import type { FilmstripFrame } from './filmstrip-opfs-storage'; +import type { FilmstripFrame } from './filmstrip-storage'; const FRAME_MEMORY_FALLBACK_BYTES = FILMSTRIP_EXTRACT_WIDTH * FILMSTRIP_EXTRACT_HEIGHT * 4; diff --git a/src/features/timeline/services/filmstrip-opfs-storage.ts b/src/features/timeline/services/filmstrip-opfs-storage.ts deleted file mode 100644 index f1698f910..000000000 --- a/src/features/timeline/services/filmstrip-opfs-storage.ts +++ /dev/null @@ -1,623 +0,0 @@ -/** - * OPFS Filmstrip Storage - * - * Simple storage for filmstrip frames. Worker handles saving, - * this service handles loading and providing object URLs. - * - * Storage structure: - * filmstrips/{mediaId}/ - * meta.json - { width, height, isComplete, frameCount } - * 0.jpg, 1.jpg, 2.jpg, ... 
(legacy caches may still use .webp) - */ - -import { createLogger } from '@/shared/logging/logger'; -import { getCacheMigration } from '@/infrastructure/storage/cache-version'; -import { - mirrorBlobToWorkspace, - mirrorJsonToWorkspace, - readWorkspaceBlob, - removeWorkspaceCacheEntry, -} from '@/infrastructure/storage/workspace-fs/cache-mirror'; -import { - filmstripFileFramePath, - filmstripMetaPath, - WORKSPACE_FILMSTRIPS_DIR, -} from '@/infrastructure/storage/workspace-fs/paths'; -import { safeWrite } from '../utils/opfs-safe-write'; - -const logger = createLogger('FilmstripOPFS'); - -const FILMSTRIP_DIR = 'filmstrips'; -const FRAME_RATE = 1; // Must match worker - 1fps for filmstrip thumbnails -const PRIMARY_FRAME_EXT = 'jpg'; -const LEGACY_FRAME_EXT = 'webp'; -const FRAME_EXTENSIONS = new Set([PRIMARY_FRAME_EXT, LEGACY_FRAME_EXT]); -const VALIDATION_TTL_MS = 10_000; - -function parseFrameFileNameParts(name: string): { index: number; ext: string } | null { - const dotIndex = name.lastIndexOf('.'); - if (dotIndex <= 0) return null; - const ext = name.slice(dotIndex + 1).toLowerCase(); - if (!FRAME_EXTENSIONS.has(ext)) return null; - const index = parseInt(name.slice(0, dotIndex), 10); - if (Number.isNaN(index)) return null; - return { index, ext }; -} - -function parseFrameFileName(name: string): number | null { - const parsed = parseFrameFileNameParts(name); - return parsed?.index ?? null; -} - -interface FilmstripMetadata { - width: number; - height: number; - isComplete: boolean; - frameCount: number; -} - -export interface FilmstripFrame { - index: number; - timestamp: number; - url: string; // Object URL for img src - byteSize?: number; - /** Hardware-backed bitmap for instant canvas rendering (skips JPEG decode) */ - bitmap?: ImageBitmap; -} - -interface LoadedFilmstrip { - metadata: FilmstripMetadata; - frames: FilmstripFrame[]; - existingIndices: number[]; -} - -interface MediaDirCacheEntry { - handle: FileSystemDirectoryHandle; - lastValidated: number; -} - -/** - * OPFS Filmstrip Storage Service - */ -class FilmstripOPFSStorage { - private dirHandle: FileSystemDirectoryHandle | null = null; - private initPromise: Promise | null = null; - private objectUrls = new Map>(); // mediaId -> frameIndex -> url - private mediaDirCache = new Map(); - - private scheduleRevoke(urls: string[]): void { - if (urls.length === 0) return; - - const revoke = () => { - for (const url of urls) { - URL.revokeObjectURL(url); - } - }; - - if (typeof requestIdleCallback === 'function') { - requestIdleCallback(revoke, { timeout: 10_000 }); - return; - } - - setTimeout(revoke, 0); - } - - private setFrameUrl(mediaId: string, index: number, url: string): void { - const urlsByIndex = this.objectUrls.get(mediaId) ?? 
new Map(); - const previous = urlsByIndex.get(index); - urlsByIndex.set(index, url); - this.objectUrls.set(mediaId, urlsByIndex); - - if (previous && previous !== url) { - this.scheduleRevoke([previous]); - } - } - - private replaceAllFrameUrls( - mediaId: string, - entries: Array<{ index: number; url: string }> - ): void { - const previous = this.objectUrls.get(mediaId); - const next = new Map(); - for (const entry of entries) { - next.set(entry.index, entry.url); - } - this.objectUrls.set(mediaId, next); - - if (!previous) return; - - const toRevoke: string[] = []; - for (const [index, url] of previous) { - const nextUrl = next.get(index); - if (nextUrl !== url) { - toRevoke.push(url); - } - } - this.scheduleRevoke(toRevoke); - } - - /** - * Initialize OPFS directory - */ - private async ensureDirectory(): Promise { - if (this.dirHandle) return this.dirHandle; - if (this.initPromise) return this.initPromise; - - this.initPromise = this.initialize(); - return this.initPromise; - } - - private async initialize(): Promise { - try { - const root = await navigator.storage.getDirectory(); - const dir = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true }); - - // Run migration if needed - const migration = getCacheMigration('filmstrip'); - if (migration.needsMigration) { - const entries: string[] = []; - for await (const entry of dir.values()) { - entries.push(entry.name); - } - for (const name of entries) { - await dir.removeEntry(name, { recursive: true }).catch(() => {}); - } - migration.markComplete(); - logger.info(`Filmstrip cache cleared for v${migration.newVersion}`); - } - - this.dirHandle = dir; - return dir; - } catch (error) { - logger.error('Failed to initialize OPFS:', error); - throw error; - } - } - - /** - * Get media directory handle - */ - private async getMediaDir(mediaId: string): Promise { - const cached = this.mediaDirCache.get(mediaId); - const dir = await this.ensureDirectory(); - - if (cached) { - if (Date.now() - cached.lastValidated <= VALIDATION_TTL_MS) { - return cached.handle; - } - - try { - // Probe the cached handle. If the underlying directory was removed, - // OPFS access will throw and we'll invalidate + recover below. 
- const iterator = cached.handle.values(); - await iterator.next(); - this.mediaDirCache.set(mediaId, { - handle: cached.handle, - lastValidated: Date.now(), - }); - return cached.handle; - } catch { - this.mediaDirCache.delete(mediaId); - try { - const reopened = await dir.getDirectoryHandle(mediaId); - this.mediaDirCache.set(mediaId, { - handle: reopened, - lastValidated: Date.now(), - }); - return reopened; - } catch { - return null; - } - } - } - - try { - const mediaDir = await dir.getDirectoryHandle(mediaId); - this.mediaDirCache.set(mediaId, { - handle: mediaDir, - lastValidated: Date.now(), - }); - return mediaDir; - } catch { - return null; - } - } - - /** - * Get or create media directory handle - */ - private async getOrCreateMediaDir(mediaId: string): Promise { - const cached = this.mediaDirCache.get(mediaId); - if (cached && Date.now() - cached.lastValidated <= VALIDATION_TTL_MS) { - return cached.handle; - } - - const dir = await this.ensureDirectory(); - const mediaDir = await dir.getDirectoryHandle(mediaId, { create: true }); - this.mediaDirCache.set(mediaId, { - handle: mediaDir, - lastValidated: Date.now(), - }); - return mediaDir; - } - - /** - * Save metadata file (used by worker and fallback extraction) - */ - async saveMetadata( - mediaId: string, - metadata: { width: number; height: number; isComplete: boolean; frameCount: number } - ): Promise { - const mediaDir = await this.getOrCreateMediaDir(mediaId); - const fileHandle = await mediaDir.getFileHandle('meta.json', { create: true }); - const writable = await fileHandle.createWritable(); - await safeWrite(writable, JSON.stringify(metadata)); - void mirrorJsonToWorkspace(filmstripMetaPath(mediaId), metadata); - } - - /** - * Save a frame blob at a specific index - */ - async saveFrameBlob(mediaId: string, index: number, blob: Blob): Promise { - const mediaDir = await this.getOrCreateMediaDir(mediaId); - const fileHandle = await mediaDir.getFileHandle(`${index}.${PRIMARY_FRAME_EXT}`, { create: true }); - const writable = await fileHandle.createWritable(); - await safeWrite(writable, blob); - void mirrorBlobToWorkspace( - filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT), - blob, - ); - } - - /** - * Load filmstrip - returns object URLs for img src - */ - async load(mediaId: string): Promise { - try { - let mediaDir = await this.getMediaDir(mediaId); - - // If OPFS has nothing for this media, try hydrating from the workspace - // folder — another origin may have produced the filmstrip. hydration - // writes the meta.json + primary-ext frames back into OPFS so the - // normal load path can pick it up. - if (!mediaDir) { - const hydrated = await this.hydrateFromWorkspace(mediaId); - if (!hydrated) return null; - mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return null; - } - - // Load metadata - let metadata: FilmstripMetadata; - try { - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - metadata = JSON.parse(await metaFile.text()); - } catch { - const hydrated = await this.hydrateFromWorkspace(mediaId); - if (!hydrated) return null; - try { - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - metadata = JSON.parse(await metaFile.text()); - } catch { - return null; - } - } - - // Collect frame files (dedupe by frame index, prefer primary extension). 
- const frameFilesByIndex = new Map(); - for await (const entry of mediaDir.values()) { - if (entry.kind !== 'file') continue; - const parsed = parseFrameFileNameParts(entry.name); - if (!parsed) continue; - try { - const fileHandle = entry as FileSystemFileHandle; - const file = await fileHandle.getFile(); - if (file.size <= 0) continue; - - const existing = frameFilesByIndex.get(parsed.index); - const shouldReplace = !existing - || (parsed.ext === PRIMARY_FRAME_EXT && existing.ext !== PRIMARY_FRAME_EXT); - if (shouldReplace) { - frameFilesByIndex.set(parsed.index, { file, ext: parsed.ext }); - } - } catch { - // Skip unreadable files - } - } - - const frameFiles = Array.from(frameFilesByIndex.entries()) - .map(([index, value]) => ({ index, file: value.file })) - .sort((a, b) => a.index - b.index); - - // Create object URLs - const nextUrls: Array<{ index: number; url: string }> = []; - const frames: FilmstripFrame[] = frameFiles.map(({ index, file }) => { - const url = URL.createObjectURL(file); - nextUrls.push({ index, url }); - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: file.size, - }; - }); - this.replaceAllFrameUrls(mediaId, nextUrls); - - const existingIndices = frameFiles.map(f => f.index); - - // Sanity check: if marked complete but no frames, treat as incomplete - if (metadata.isComplete && frames.length === 0) { - logger.warn(`Filmstrip ${mediaId} marked complete but has 0 frames - resetting`); - metadata.isComplete = false; - metadata.frameCount = 0; - } - - logger.debug(`Loaded filmstrip ${mediaId}: ${frames.length} frames, complete: ${metadata.isComplete}`); - - return { metadata, frames, existingIndices }; - } catch (error) { - logger.warn('Failed to load filmstrip:', error); - return null; - } - } - - /** - * Get existing frame indices (for resume) - */ - async getExistingIndices( - mediaId: string, - startIndex?: number, - endIndex?: number - ): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return []; - - const indices = new Set(); - for await (const entry of mediaDir.values()) { - if (entry.kind !== 'file') continue; - const index = parseFrameFileName(entry.name); - if (index !== null) { - if (typeof startIndex === 'number' && index < startIndex) { - continue; - } - if (typeof endIndex === 'number' && index >= endIndex) { - continue; - } - try { - const fileHandle = entry as FileSystemFileHandle; - const file = await fileHandle.getFile(); - if (file.size > 0) { - indices.add(index); - } - } catch { - // Skip - } - } - } - - return Array.from(indices).sort((a, b) => a - b); - } catch { - return []; - } - } - - /** - * Load a single frame by index - for incremental updates during extraction - */ - async loadSingleFrame(mediaId: string, index: number): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return null; - - let file: File | null = null; - try { - const primaryHandle = await mediaDir.getFileHandle(`${index}.${PRIMARY_FRAME_EXT}`); - file = await primaryHandle.getFile(); - } catch { - try { - const legacyHandle = await mediaDir.getFileHandle(`${index}.${LEGACY_FRAME_EXT}`); - file = await legacyHandle.getFile(); - } catch { - return null; - } - } - if (!file || file.size === 0) return null; - - const url = URL.createObjectURL(file); - this.setFrameUrl(mediaId, index, url); - - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: file.size, - }; - } catch { - return null; - } - } - - /** - * Create an in-memory frame URL from a worker-provided 
blob. - * Used for progressive UI updates to avoid immediate OPFS read-after-write. - */ - createFrameFromBlob(mediaId: string, index: number, blob: Blob): FilmstripFrame | null { - if (!blob || blob.size === 0) { - return null; - } - - const url = URL.createObjectURL(blob); - this.setFrameUrl(mediaId, index, url); - - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: blob.size, - }; - } - - /** - * Create an in-memory frame from a transferred ImageBitmap. - * Provides instant display without JPEG encode/decode roundtrip. - * URL is empty — the component renders from bitmap directly via canvas. - * Once the JPEG blob arrives (via createFrameFromBlob), the URL is set - * and the bitmap can be closed. - */ - createFrameFromBitmap(_mediaId: string, index: number, bitmap: ImageBitmap): FilmstripFrame | null { - if (!bitmap || bitmap.width === 0) return null; - - return { - index, - timestamp: index / FRAME_RATE, - url: '', - byteSize: bitmap.width * bitmap.height * 4, - bitmap, - }; - } - - /** - * Check if filmstrip is complete - */ - async isComplete(mediaId: string): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return false; - - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - const metadata: FilmstripMetadata = JSON.parse(await metaFile.text()); - return metadata.isComplete; - } catch { - return false; - } - } - - /** - * Delete filmstrip - */ - async delete(mediaId: string): Promise { - this.revokeUrls(mediaId); - this.mediaDirCache.delete(mediaId); - try { - const dir = await this.ensureDirectory(); - await dir.removeEntry(mediaId, { recursive: true }); - logger.debug(`Deleted filmstrip ${mediaId}`); - } catch { - // May not exist - } - void removeWorkspaceCacheEntry([WORKSPACE_FILMSTRIPS_DIR, mediaId], { - recursive: true, - }); - } - - /** - * Pull filmstrip meta + frames from the workspace folder into OPFS. - * Used as a cross-origin fallback when OPFS has no cache for this media. - * Returns true when at least the metadata was recovered. - */ - private async hydrateFromWorkspace(mediaId: string): Promise { - try { - const metaBlob = await readWorkspaceBlob(filmstripMetaPath(mediaId)); - if (!metaBlob) return false; - - let metadata: FilmstripMetadata; - try { - metadata = JSON.parse(await metaBlob.text()) as FilmstripMetadata; - } catch { - return false; - } - - const mediaDir = await this.getOrCreateMediaDir(mediaId); - - const metaHandle = await mediaDir.getFileHandle('meta.json', { create: true }); - const metaWritable = await metaHandle.createWritable(); - await safeWrite(metaWritable, JSON.stringify(metadata)); - - // Frame count is the declared upper bound; missing frames are skipped. 
- const expected = Math.max(0, metadata.frameCount | 0); - for (let index = 0; index < expected; index += 1) { - const frameBlob = await readWorkspaceBlob( - filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT), - ); - if (!frameBlob || frameBlob.size === 0) continue; - const frameHandle = await mediaDir.getFileHandle( - `${index}.${PRIMARY_FRAME_EXT}`, - { create: true }, - ); - const frameWritable = await frameHandle.createWritable(); - await safeWrite(frameWritable, frameBlob); - } - - logger.debug(`Hydrated filmstrip ${mediaId} from workspace`); - return true; - } catch (error) { - logger.warn(`hydrateFromWorkspace(${mediaId}) failed`, error); - return false; - } - } - - /** - * Revoke object URLs for a media - */ - revokeUrls(mediaId: string): void { - const urlsByIndex = this.objectUrls.get(mediaId); - if (urlsByIndex) { - for (const url of urlsByIndex.values()) { - URL.revokeObjectURL(url); - } - this.objectUrls.delete(mediaId); - } - } - - /** - * Clear all filmstrips - */ - async clearAll(): Promise { - // Revoke all URLs - for (const mediaId of this.objectUrls.keys()) { - this.revokeUrls(mediaId); - } - this.mediaDirCache.clear(); - - try { - const dir = await this.ensureDirectory(); - const entries: string[] = []; - for await (const entry of dir.values()) { - entries.push(entry.name); - } - for (const name of entries) { - await dir.removeEntry(name, { recursive: true }); - } - logger.debug(`Cleared ${entries.length} filmstrips`); - } catch (error) { - logger.error('Failed to clear filmstrips:', error); - } - void removeWorkspaceCacheEntry([WORKSPACE_FILMSTRIPS_DIR], { recursive: true }); - } - - /** - * List all stored filmstrips - */ - async list(): Promise { - try { - const dir = await this.ensureDirectory(); - const ids: string[] = []; - for await (const entry of dir.values()) { - if (entry.kind === 'directory') { - ids.push(entry.name); - } - } - return ids; - } catch { - return []; - } - } -} - -// Singleton -export const filmstripOPFSStorage = new FilmstripOPFSStorage(); - diff --git a/src/features/timeline/services/filmstrip-storage.ts b/src/features/timeline/services/filmstrip-storage.ts new file mode 100644 index 000000000..c05472043 --- /dev/null +++ b/src/features/timeline/services/filmstrip-storage.ts @@ -0,0 +1,446 @@ +/** + * Filmstrip Storage + * + * Filmstrip frames are now persisted in the selected workspace folder: + * filmstrips/{mediaId}/ + * meta.json - { width, height, isComplete, frameCount } + * 0.jpg, 1.jpg, 2.jpg, ... (legacy caches may still use .webp) + * + * Legacy OPFS filmstrips are read only as a fallback. When encountered, + * they are hydrated into the workspace so subsequent reads stay unified. 
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import { getCacheMigration } from '@/infrastructure/storage/cache-version';
+import {
+  readBlob,
+  readJson,
+  writeBlob,
+  writeJsonAtomic,
+  removeEntry,
+  listDirectory,
+} from '@/infrastructure/storage/workspace-fs/fs-primitives';
+import { requireWorkspaceRoot } from '@/infrastructure/storage/workspace-fs/root';
+import {
+  filmstripFileFramePath,
+  filmstripMetaPath,
+  WORKSPACE_FILMSTRIPS_DIR,
+} from '@/infrastructure/storage/workspace-fs/paths';
+
+const logger = createLogger('FilmstripStorage');
+
+const FILMSTRIP_DIR = 'filmstrips';
+const FRAME_RATE = 1; // Must match worker - 1fps for filmstrip thumbnails
+const PRIMARY_FRAME_EXT = 'jpg';
+const LEGACY_FRAME_EXT = 'webp';
+const FRAME_EXTENSIONS = new Set([PRIMARY_FRAME_EXT, LEGACY_FRAME_EXT]);
+
+function parseFrameFileNameParts(name: string): { index: number; ext: string } | null {
+  const dotIndex = name.lastIndexOf('.');
+  if (dotIndex <= 0) return null;
+  const ext = name.slice(dotIndex + 1).toLowerCase();
+  if (!FRAME_EXTENSIONS.has(ext)) return null;
+  const index = parseInt(name.slice(0, dotIndex), 10);
+  if (Number.isNaN(index)) return null;
+  return { index, ext };
+}
+
+function parseFrameFileName(name: string): number | null {
+  return parseFrameFileNameParts(name)?.index ?? null;
+}
+
+interface FilmstripMetadata {
+  width: number;
+  height: number;
+  isComplete: boolean;
+  frameCount: number;
+}
+
+export interface FilmstripFrame {
+  index: number;
+  timestamp: number;
+  url: string;
+  byteSize?: number;
+  bitmap?: ImageBitmap;
+}
+
+interface LoadedFilmstrip {
+  metadata: FilmstripMetadata;
+  frames: FilmstripFrame[];
+  existingIndices: number[];
+}
+
+class FilmstripStorage {
+  private objectUrls = new Map<string, Map<number, string>>();
+  private legacyInitPromise: Promise<FileSystemDirectoryHandle | null> | null = null;
+
+  private scheduleRevoke(urls: string[]): void {
+    if (urls.length === 0) return;
+
+    const revoke = () => {
+      for (const url of urls) {
+        URL.revokeObjectURL(url);
+      }
+    };
+
+    if (typeof requestIdleCallback === 'function') {
+      requestIdleCallback(revoke, { timeout: 10_000 });
+      return;
+    }
+
+    setTimeout(revoke, 0);
+  }
+
+  private setFrameUrl(mediaId: string, index: number, url: string): void {
+    const urlsByIndex = this.objectUrls.get(mediaId) ?? new Map<number, string>();
+    const previous = urlsByIndex.get(index);
+    urlsByIndex.set(index, url);
+    this.objectUrls.set(mediaId, urlsByIndex);
+
+    if (previous && previous !== url) {
+      this.scheduleRevoke([previous]);
+    }
+  }
+
+  private replaceAllFrameUrls(
+    mediaId: string,
+    entries: Array<{ index: number; url: string }>,
+  ): void {
+    const previous = this.objectUrls.get(mediaId);
+    const next = new Map<number, string>();
+    for (const entry of entries) {
+      next.set(entry.index, entry.url);
+    }
+    this.objectUrls.set(mediaId, next);
+
+    if (!previous) return;
+
+    const toRevoke: string[] = [];
+    for (const [index, url] of previous) {
+      const nextUrl = next.get(index);
+      if (nextUrl !== url) {
+        toRevoke.push(url);
+      }
+    }
+    this.scheduleRevoke(toRevoke);
+  }
+
+  private async readMetadata(
+    mediaId: string,
+  ): Promise<FilmstripMetadata | null> {
+    return await readJson(requireWorkspaceRoot(), filmstripMetaPath(mediaId));
+  }
+
+  private async ensureWorkspaceFilmstrip(
+    mediaId: string,
+  ): Promise<FilmstripMetadata | null> {
+    const existing = await this.readMetadata(mediaId);
+    if (existing) return existing;
+
+    const hydrated = await this.hydrateFromLegacyOpfs(mediaId);
+    if (!hydrated) return null;
+    return await this.readMetadata(mediaId);
+  }
+
+  private async getLegacyFilmstripRoot(): Promise<FileSystemDirectoryHandle | null> {
+    if (this.legacyInitPromise) return this.legacyInitPromise;
+
+    this.legacyInitPromise = (async () => {
+      try {
+        const root = await navigator.storage.getDirectory();
+        const dir = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true });
+
+        const migration = getCacheMigration('filmstrip');
+        if (migration.needsMigration) {
+          const entries: string[] = [];
+          for await (const entry of dir.values()) {
+            entries.push(entry.name);
+          }
+          for (const name of entries) {
+            await dir.removeEntry(name, { recursive: true }).catch(() => undefined);
+          }
+          migration.markComplete();
+          logger.info(`Legacy filmstrip cache cleared for v${migration.newVersion}`);
+        }
+
+        return dir;
+      } catch (error) {
+        logger.warn('Failed to access legacy OPFS filmstrip root', error);
+        return null;
+      }
+    })();
+
+    return this.legacyInitPromise;
+  }
+
+  private async getLegacyMediaDir(mediaId: string): Promise<FileSystemDirectoryHandle | null> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return null;
+      return await root.getDirectoryHandle(mediaId);
+    } catch {
+      return null;
+    }
+  }
+
+  private async deleteLegacyFilmstrip(mediaId: string): Promise<void> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return;
+      await root.removeEntry(mediaId, { recursive: true });
+    } catch {
+      // ignore missing legacy cache
+    }
+  }
+
+  private async clearLegacyFilmstrips(): Promise<void> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return;
+      const entries: string[] = [];
+      for await (const entry of root.values()) {
+        entries.push(entry.name);
+      }
+      for (const name of entries) {
+        await root.removeEntry(name, { recursive: true }).catch(() => undefined);
+      }
+    } catch (error) {
+      logger.warn('Failed to clear legacy OPFS filmstrips', error);
+    }
+  }
+
+  private async hydrateFromLegacyOpfs(mediaId: string): Promise<boolean> {
+    try {
+      const mediaDir = await this.getLegacyMediaDir(mediaId);
+      if (!mediaDir) return false;
+
+      const metaHandle = await mediaDir.getFileHandle('meta.json');
+      const metaFile = await metaHandle.getFile();
+      const metadata = JSON.parse(await metaFile.text()) as FilmstripMetadata;
+      await writeJsonAtomic(requireWorkspaceRoot(), filmstripMetaPath(mediaId), metadata);
+
+      for await (const entry of mediaDir.values()) {
+        if (entry.kind !== 'file') continue;
+        const parsed = parseFrameFileNameParts(entry.name);
+        if (!parsed) continue;
+        const file = await (entry as FileSystemFileHandle).getFile();
+        if (file.size <= 0) continue;
+        await writeBlob(
+          requireWorkspaceRoot(),
+          filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+          file,
+        );
+      }
+
+      logger.debug(`Hydrated filmstrip ${mediaId} from legacy OPFS`);
+      return true;
+    } catch (error) {
+      logger.warn(`hydrateFromLegacyOpfs(${mediaId}) failed`, error);
+      return false;
+    }
+  }
+
+  async saveMetadata(
+    mediaId: string,
+    metadata: { width: number; height: number; isComplete: boolean; frameCount: number },
+  ): Promise<void> {
+    await writeJsonAtomic(requireWorkspaceRoot(), filmstripMetaPath(mediaId), metadata);
+  }
+
+  async saveFrameBlob(mediaId: string, index: number, blob: Blob): Promise<void> {
+    await writeBlob(
+      requireWorkspaceRoot(),
+      filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT),
+      blob,
+    );
+  }
+
+  async load(mediaId: string): Promise<LoadedFilmstrip | null> {
+    try {
+      const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+      if (!metadata) return null;
+
+      const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId]);
+      const frameFilesByIndex = new Map<number, { blob: Blob; ext: string }>();
+
+      for (const entry of entries) {
+        if (entry.kind !== 'file') continue;
+        const parsed = parseFrameFileNameParts(entry.name);
+        if (!parsed) continue;
+
+        const blob = await readBlob(
+          requireWorkspaceRoot(),
+          filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+        );
+        if (!blob || blob.size <= 0) continue;
+
+        const existing = frameFilesByIndex.get(parsed.index);
+        const shouldReplace = !existing
+          || (parsed.ext === PRIMARY_FRAME_EXT && existing.ext !== PRIMARY_FRAME_EXT);
+        if (shouldReplace) {
+          frameFilesByIndex.set(parsed.index, { blob, ext: parsed.ext });
+        }
+      }
+
+      const frameFiles = Array.from(frameFilesByIndex.entries())
+        .map(([index, value]) => ({ index, blob: value.blob }))
+        .sort((a, b) => a.index - b.index);
+
+      const nextUrls: Array<{ index: number; url: string }> = [];
+      const frames: FilmstripFrame[] = frameFiles.map(({ index, blob }) => {
+        const url = URL.createObjectURL(blob);
+        nextUrls.push({ index, url });
+        return {
+          index,
+          timestamp: index / FRAME_RATE,
+          url,
+          byteSize: blob.size,
+        };
+      });
+      this.replaceAllFrameUrls(mediaId, nextUrls);
+
+      const existingIndices = frameFiles.map((frame) => frame.index);
+
+      if (metadata.isComplete && frames.length === 0) {
+        logger.warn(`Filmstrip ${mediaId} marked complete but has 0 frames - resetting`);
+        metadata.isComplete = false;
+        metadata.frameCount = 0;
+      }
+
+      logger.debug(`Loaded filmstrip ${mediaId}: ${frames.length} frames, complete: ${metadata.isComplete}`);
+      return { metadata, frames, existingIndices };
+    } catch (error) {
+      logger.warn('Failed to load filmstrip:', error);
+      return null;
+    }
+  }
+
+  async getExistingIndices(
+    mediaId: string,
+    startIndex?: number,
+    endIndex?: number,
+  ): Promise<number[]> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    if (!metadata) return [];
+
+    const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId]);
+    const indices = new Set<number>();
+
+    for (const entry of entries) {
+      if (entry.kind !== 'file') continue;
+      const index = parseFrameFileName(entry.name);
+      if (index === null) continue;
+      if (typeof startIndex === 'number' && index < startIndex) continue;
+      if (typeof endIndex === 'number' && index >= endIndex) continue;
+
+      const parsed = parseFrameFileNameParts(entry.name);
+      if (!parsed) continue;
+      const blob = await readBlob(
+        requireWorkspaceRoot(),
+        filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+      );
+      if (blob && blob.size > 0) {
+        indices.add(index);
+      }
+    }
+
+    return Array.from(indices).sort((a, b) => a - b);
+  }
+
+  async loadSingleFrame(mediaId: string, index: number): Promise<FilmstripFrame | null> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    if (!metadata) return null;
+
+    let blob = await readBlob(
+      requireWorkspaceRoot(),
+      filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT),
+    );
+    if (!blob || blob.size === 0) {
+      blob = await readBlob(
+        requireWorkspaceRoot(),
+        filmstripFileFramePath(mediaId, index, LEGACY_FRAME_EXT),
+      );
+    }
+    if (!blob || blob.size === 0) return null;
+
+    const url = URL.createObjectURL(blob);
+    this.setFrameUrl(mediaId, index, url);
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url,
+      byteSize: blob.size,
+    };
+  }
+
+  createFrameFromBlob(mediaId: string, index: number, blob: Blob): FilmstripFrame | null {
+    if (!blob || blob.size === 0) return null;
+
+    const url = URL.createObjectURL(blob);
+    this.setFrameUrl(mediaId, index, url);
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url,
+      byteSize: blob.size,
+    };
+  }
+
+  createFrameFromBitmap(_mediaId: string, index: number, bitmap: ImageBitmap): FilmstripFrame | null {
+    if (!bitmap || bitmap.width === 0) return null;
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url: '',
+      byteSize: bitmap.width * bitmap.height * 4,
+      bitmap,
+    };
+  }
+
+  async isComplete(mediaId: string): Promise<boolean> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    return metadata?.isComplete ?? false;
+  }
+
+  async delete(mediaId: string): Promise<void> {
+    this.revokeUrls(mediaId);
+    await removeEntry(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId], {
+      recursive: true,
+    });
+    await this.deleteLegacyFilmstrip(mediaId);
+    logger.debug(`Deleted filmstrip ${mediaId}`);
+  }
+
+  revokeUrls(mediaId: string): void {
+    const urlsByIndex = this.objectUrls.get(mediaId);
+    if (!urlsByIndex) return;
+
+    for (const url of urlsByIndex.values()) {
+      URL.revokeObjectURL(url);
+    }
+    this.objectUrls.delete(mediaId);
+  }
+
+  async clearAll(): Promise<void> {
+    for (const mediaId of this.objectUrls.keys()) {
+      this.revokeUrls(mediaId);
+    }
+
+    await removeEntry(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR], {
+      recursive: true,
+    }).catch(() => undefined);
+    await this.clearLegacyFilmstrips();
+  }
+
+  async list(): Promise<string[]> {
+    const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR]);
+    return entries
+      .filter((entry) => entry.kind === 'directory')
+      .map((entry) => entry.name);
+  }
+}
+
+export const filmstripStorage = new FilmstripStorage();
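To orient readers of the new service, here is a minimal consumer sketch. `renderFilmstrip` and `disposeFilmstrip` are hypothetical names for illustration; only the `filmstripStorage` calls and the `LoadedFilmstrip` shape come from the file above, and a workspace root must already have been granted (otherwise `requireWorkspaceRoot()` throws inside the service):

```ts
import { filmstripStorage } from '@/features/timeline/services/filmstrip-storage';

// Hypothetical consumer: render whatever is already persisted, then backfill
// any gaps as the extraction worker persists more frames.
async function renderFilmstrip(mediaId: string, container: HTMLElement): Promise<void> {
  const loaded = await filmstripStorage.load(mediaId);
  if (!loaded) return; // nothing persisted yet for this media

  for (const frame of loaded.frames) {
    const img = document.createElement('img');
    img.src = frame.url; // object URL owned (and later revoked) by the service
    container.appendChild(img);
  }

  // Indices below frameCount that are not on disk yet can be retried
  // individually via loadSingleFrame.
  const present = new Set(loaded.existingIndices);
  for (let index = 0; index < loaded.metadata.frameCount; index += 1) {
    if (present.has(index)) continue;
    const late = await filmstripStorage.loadSingleFrame(mediaId, index);
    if (late) {
      const img = document.createElement('img');
      img.src = late.url;
      container.appendChild(img);
    }
  }
}

// Release the object URLs once the strip leaves the viewport.
function disposeFilmstrip(mediaId: string): void {
  filmstripStorage.revokeUrls(mediaId);
}
```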
diff --git a/src/features/timeline/stores/actions/item-actions.linked-items.test.ts b/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
index a6b3530f4..e570d537a 100644
--- a/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
+++ b/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
@@ -1,5 +1,5 @@
 import { beforeEach, describe, expect, it } from 'vitest';
-import type { AudioItem, TimelineTrack, VideoItem } from '@/types/timeline';
+import type { AudioItem, TextItem, TimelineTrack, VideoItem } from '@/types/timeline';
 import { useItemsStore } from '../items-store';
 import { useTransitionsStore } from '../transitions-store';
 import { useKeyframesStore } from '../keyframes-store';
@@ -51,6 +51,20 @@ function makeAudioItem(overrides: Partial<AudioItem> = {}): AudioItem {
   };
 }
 
+function makeTextItem(overrides: Partial<TextItem> = {}): TextItem {
+  return {
+    id: 'text-1',
+    type: 'text',
+    trackId: 'caption-track',
+    from: 0,
+    durationInFrames: 60,
+    text: 'Caption',
+    style: {},
+    textRole: 'caption',
+    ...overrides,
+  };
+}
+
 function makeTrack(overrides: Partial<TimelineTrack> & Pick<TimelineTrack, 'id' | 'name' | 'order' | 'kind'>): TimelineTrack {
   return {
     height: 80,
@@ -599,6 +613,65 @@ describe('linked timeline items', () => {
     expect(items.find((item) => item.id === 'solo-audio')).toBeUndefined();
   });
 
+  it('ripple delete shifts attached captions with their surviving clip', () => {
+    useItemsStore.getState().setTracks([
+      makeTrack({ id: 'video-track', name: 'V1', order: 0, kind: 'video' }),
+      makeTrack({ id: 'audio-track', name: 'A1', order: 1, kind: 'audio' }),
+      makeTrack({ id: 'video-track-2', name: 'V2', order: 2, kind: 'video', syncLock: false }),
+    ]);
+    useItemsStore.getState().setItems([
+      makeVideoItem({
+        id: 'video-delete',
+        durationInFrames: 100,
+        linkedGroupId: 'group-del',
+        originId: 'origin-del',
+        mediaId: 'media-del',
+      }),
+      makeAudioItem({
+        id: 'audio-delete',
+        durationInFrames: 100,
+        linkedGroupId: 'group-del',
+        originId: 'origin-del',
+        mediaId: 'media-del',
+      }),
+      makeVideoItem({
+        id: 'video-survivor',
+        from: 100,
+        durationInFrames: 60,
+        linkedGroupId: 'group-survivor',
+        originId: 'origin-survivor',
+        mediaId: 'media-survivor',
+      }),
+      makeAudioItem({
+        id: 'audio-survivor',
+        from: 100,
+        durationInFrames: 60,
+        linkedGroupId: 'group-survivor',
+        originId: 'origin-survivor',
+        mediaId: 'media-survivor',
+      }),
+      makeTextItem({
+        id: 'caption-survivor',
+        trackId: 'video-track-2',
+        from: 100,
+        durationInFrames: 60,
+        captionSource: {
+          type: 'transcript',
+          clipId: 'video-survivor',
+          mediaId: 'media-survivor',
+        },
+      }),
+    ]);
+
+    rippleDeleteItems(['video-delete']);
+
+    const state = useItemsStore.getState();
+    expect(state.items.find((item) => item.id === 'video-survivor')).toMatchObject({ from: 0 });
+    expect(state.items.find((item) => item.id === 'audio-survivor')).toMatchObject({ from: 0 });
+    expect(state.items.find((item) => item.id === 'caption-survivor')).toMatchObject({ from: 0 });
+    expect(state.maxItemEndFrame).toBe(60);
+  });
+
   it('links an arbitrary multi-selection with a fresh group id', () => {
     useItemsStore.getState().setItems([
       makeVideoItem({ linkedGroupId: 'video-1' }),
diff --git a/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts b/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
index f009d2fab..997e2008c 100644
--- a/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
+++ b/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
@@ -2,6 +2,8 @@ import { beforeEach, describe, expect, it } from 'vitest';
 import type { VideoItem, TextItem } from '@/types/timeline';
 import { useItemsStore } from '../items-store';
 import { useTimelineSettingsStore } from '../timeline-settings-store';
+import { usePlaybackStore } from '@/shared/state/playback';
+import { usePreviewBridgeStore } from '@/shared/state/preview-bridge';
 import { slipItem, slideItem } from './item-actions';
 
 function makeVideoItem(overrides: Partial<VideoItem> = {}): VideoItem {
@@ -23,6 +25,20 @@ describe('slipItem', () => {
     useTimelineSettingsStore.setState({ fps: 30 });
     useItemsStore.getState().setItems([]);
     useItemsStore.getState().setTracks([]);
+    usePlaybackStore.setState({
+      currentFrame: 150,
+      currentFrameEpoch: 0,
+      previewFrame: null,
+      previewFrameEpoch: 0,
+      isPlaying: false,
+    });
+    usePreviewBridgeStore.setState({
displayedFrame: null, + captureFrame: null, + captureFrameImageData: null, + captureCanvasSource: null, + postEditWarmRequest: null, + }); }); it('shifts sourceStart and sourceEnd by slipDelta', () => { @@ -371,6 +387,51 @@ describe('slideItem', () => { expect(updatedMiddle.sourceEnd).toBe(updatedRight.sourceStart); }); + it('queues post-edit warm frames around the edited clip boundaries', () => { + const left = makeVideoItem({ + id: 'left', + trackId: 'track-1', + from: 0, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + }); + const middle = makeVideoItem({ + id: 'middle', + trackId: 'track-1', + from: 100, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + mediaId: 'media-2', + }); + const right = makeVideoItem({ + id: 'right', + trackId: 'track-1', + from: 200, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + mediaId: 'media-3', + }); + + useItemsStore.getState().setItems([left, middle, right]); + + slideItem('middle', 20, 'left', 'right'); + + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toMatchObject({ + frame: 150, + itemIds: ['middle', 'left', 'right'], + frames: expect.arrayContaining([150, 120, 121, 218, 219, 0, 1, 118, 119, 220, 221, 298, 299]), + }); + }); + it('keeps default slide semantics for non-split chains', () => { const left = makeVideoItem({ id: 'left', diff --git a/src/features/timeline/stores/actions/item-actions.ts b/src/features/timeline/stores/actions/item-actions.ts index 03d878255..6744b7e7e 100644 --- a/src/features/timeline/stores/actions/item-actions.ts +++ b/src/features/timeline/stores/actions/item-actions.ts @@ -22,6 +22,7 @@ import { expandSelectionWithLinkedItems, getLinkedItemIds, } from '../../utils/linked-items'; +import { isTrackSyncLockEnabled } from '../../utils/track-sync-lock'; import { placeItemsWithoutTimelineOverlap } from './item-placement'; function isLinkedSelectionEnabled(): boolean { @@ -167,8 +168,9 @@ export function rippleDeleteItems(ids: string[]): void { })); // Per-track: shift downstream items on the same track as each deleted item. - // Linked counterparts on other tracks shift via buildLinkedLeftShiftUpdates. - // Solo clips on unrelated tracks are left in place. + // Linked counterparts and attached captions on tracks that won't be handled + // by sync-lock ripple get shifted manually. Solo clips on unrelated tracks + // are left in place. for (const item of remainingItems) { const shiftAmount = items .filter((candidate) => idsToDelete.has(candidate.id)) @@ -180,8 +182,30 @@ export function rippleDeleteItems(ids: string[]): void { } } + const trackById = new Map(useItemsStore.getState().tracks.map((track) => [track.id, track])); + const itemById = new Map(remainingItems.map((item) => [item.id, item])); + const shiftByItemId = new Map(); + + for (const [itemId, shiftAmount] of baseShiftByItemId) { + if (shiftAmount <= 0) continue; + + const relatedIds = expandIdsWithLinkedItems(remainingItems, [itemId], linkedSelectionEnabled); + for (const relatedId of relatedIds) { + const relatedItem = itemById.get(relatedId); + if (!relatedItem) continue; + + const handledBySyncLock = !editedTrackIds.has(relatedItem.trackId) + && isTrackSyncLockEnabled(trackById.get(relatedItem.trackId)); + if (handledBySyncLock) { + continue; + } + + shiftByItemId.set(relatedId, Math.max(shiftByItemId.get(relatedId) ?? 
0, shiftAmount));
+    }
+  }
+
   const updates = remainingItems.flatMap((item) => {
-    const shiftAmount = baseShiftByItemId.get(item.id) ?? 0;
+    const shiftAmount = shiftByItemId.get(item.id) ?? 0;
     return shiftAmount > 0
       ? [{ id: item.id, from: item.from - shiftAmount }]
       : [];
diff --git a/src/features/timeline/stores/actions/item-edit-actions.ts b/src/features/timeline/stores/actions/item-edit-actions.ts
index 3f5e05f04..e33cf730f 100644
--- a/src/features/timeline/stores/actions/item-edit-actions.ts
+++ b/src/features/timeline/stores/actions/item-edit-actions.ts
@@ -15,6 +15,7 @@ import {
   mediaLibraryService,
   opfsService,
 } from '@/features/timeline/deps/media-library-service';
+import { writeMediaSource } from '@/infrastructure/storage/workspace-fs/media-source';
 import { toast } from 'sonner';
 import { execute, applyTransitionRepairs, getLogger } from './shared';
 import {
@@ -24,12 +25,18 @@ import {
   getSynchronizedLinkedItemsForEdit,
 } from './linked-edit';
 import { blobUrlManager } from '@/infrastructure/browser/blob-url-manager';
+import { usePlaybackStore } from '@/shared/state/playback';
+import { usePreviewBridgeStore } from '@/shared/state/preview-bridge';
 import { timelineToSourceFrames, sourceToTimelineFrames } from '../../utils/source-calculations';
 import { computeClampedSlipDelta } from '../../utils/slip-utils';
 import { computeSlideContinuitySourceDelta } from '../../utils/slide-utils';
 import { clampSlideDeltaToPreserveTransitions } from '../../utils/transition-utils';
 import { calculateTransitionPortions } from '@/core/timeline/transitions/transition-planner';
-import { getLinkedItemIds, getUniqueLinkedItemAnchorIds } from '../../utils/linked-items';
+import {
+  expandItemIdsWithAttachedCaptions,
+  getLinkedItemIds,
+  getUniqueLinkedItemAnchorIds,
+} from '../../utils/linked-items';
 import {
   propagateInsertedGapToSyncLockedTracks,
   propagateRemovedIntervalsToSyncLockedTracks,
@@ -40,6 +47,67 @@ function isLinkedSelectionEnabled(): boolean {
   return useEditorStore.getState().linkedSelectionEnabled;
 }
 
+const POST_EDIT_WARM_MAX_FRAMES = 32;
+
+function appendWarmFrame(target: number[], seen: Set<number>, frame: number): void {
+  if (!Number.isFinite(frame)) return;
+  const normalizedFrame = Math.max(0, Math.round(frame));
+  if (seen.has(normalizedFrame)) return;
+  seen.add(normalizedFrame);
+  target.push(normalizedFrame);
+}
+
+function appendItemWarmFrames(
+  target: number[],
+  seen: Set<number>,
+  item: TimelineItem | undefined,
+): void {
+  if (!item) return;
+  const startFrame = Math.max(0, Math.trunc(item.from));
+  const endFrame = Math.max(startFrame, Math.trunc(item.from + item.durationInFrames) - 1);
+  appendWarmFrame(target, seen, startFrame);
+  appendWarmFrame(target, seen, Math.min(endFrame, startFrame + 1));
+  appendWarmFrame(target, seen, Math.max(startFrame, endFrame - 1));
+  appendWarmFrame(target, seen, endFrame);
+}
+
+function collectPostEditWarmFrames(
+  itemIds: Iterable<string>,
+  preferredFrames: number[] = [],
+): number[] {
+  const frames: number[] = [];
+  const seen = new Set<number>();
+
+  for (const frame of preferredFrames) {
+    appendWarmFrame(frames, seen, frame);
+  }
+
+  const itemById = useItemsStore.getState().itemById;
+  for (const itemId of itemIds) {
+    appendItemWarmFrames(frames, seen, itemById[itemId]);
+    if (frames.length >= POST_EDIT_WARM_MAX_FRAMES) {
+      break;
+    }
+  }
+
+  return frames.slice(0, POST_EDIT_WARM_MAX_FRAMES);
+}
+
+function requestPostEditWarmForItems(
+  itemIds: Iterable<string>,
+  preferredFrames: number[] = [],
+): void {
+  const playbackState =
usePlaybackStore.getState(); + if (playbackState.isPlaying) return; + + const uniqueItemIds = Array.from(new Set(itemIds)); + if (uniqueItemIds.length === 0) return; + + const primaryFrame = playbackState.currentFrame; + const warmFrames = collectPostEditWarmFrames(uniqueItemIds, [primaryFrame, ...preferredFrames]); + usePreviewBridgeStore.getState().requestPostEditWarm(primaryFrame, uniqueItemIds, warmFrames); +} + function applySynchronizedTrim(id: string, handle: 'start' | 'end', trimAmount: number): void { const itemsStore = useItemsStore.getState(); const itemsBefore = itemsStore.items; @@ -70,7 +138,9 @@ function applySynchronizedTrim(id: string, handle: 'start' | 'end', trimAmount: } } - applyTransitionRepairs(synchronizedItems.map((item) => item.id)); + const affectedIds = synchronizedItems.map((item) => item.id); + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); } @@ -485,7 +555,7 @@ export function rateStretchItem( moveUpdates.push({ id: downstream.id, from: downstream.from + endDelta }); // Also move linked companions on other tracks - const linkedIds = getLinkedItemIds(freshItems, downstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, downstream.id)); for (const linkedId of linkedIds) { if (linkedId === downstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -506,7 +576,7 @@ export function rateStretchItem( if (neighbor) { movedIds.add(neighbor.id); moveUpdates.push({ id: neighbor.id, from: neighbor.from + endDelta }); - const linkedIds = getLinkedItemIds(freshItems, neighbor.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, neighbor.id)); for (const linkedId of linkedIds) { if (linkedId === neighbor.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -537,7 +607,7 @@ export function rateStretchItem( movedIds.add(upstream.id); moveUpdates.push({ id: upstream.id, from: Math.max(0, upstream.from + fromDelta) }); - const linkedIds = getLinkedItemIds(freshItems, upstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, upstream.id)); for (const linkedId of linkedIds) { if (linkedId === upstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -557,7 +627,7 @@ export function rateStretchItem( if (neighbor) { movedIds.add(neighbor.id); moveUpdates.push({ id: neighbor.id, from: Math.max(0, neighbor.from + fromDelta) }); - const linkedIds = getLinkedItemIds(freshItems, neighbor.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, neighbor.id)); for (const linkedId of linkedIds) { if (linkedId === neighbor.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -579,6 +649,7 @@ export function rateStretchItem( // Repair transitions for all affected clips const allAffectedIds = [...allSynchronizedIds, ...movedIds]; applyTransitionRepairs(allAffectedIds); + requestPostEditWarmForItems(allAffectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, newFrom, newDuration, newSpeed }); @@ -703,7 +774,7 @@ export function resetSpeedWithRipple(itemIds: string[]): void { moveUpdates.push({ id: downstream.id, from: downstream.from + growth }); // Also move linked companions on other tracks - const linkedIds 
= getLinkedItemIds(freshItems, downstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, downstream.id)); for (const linkedId of linkedIds) { if (linkedId === downstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -723,6 +794,7 @@ export function resetSpeedWithRipple(itemIds: string[]): void { // Phase 3: Repair transitions for all affected clips const allAffectedIds = [...allChangedIds, ...movedIds]; applyTransitionRepairs(allAffectedIds); + requestPostEditWarmForItems(allAffectedIds); useTimelineSettingsStore.getState().markDirty(); }, { itemIds }); @@ -861,10 +933,14 @@ export async function insertFreezeFrame( updatedAt: Date.now(), }; - // Store the frame blob in OPFS + // Store the frame blob in OPFS, then mirror it into the workspace folder + // so other origins and external tooling can see it on disk. const opfsPath = `content/${frameMediaId.slice(0, 2)}/${frameMediaId.slice(2, 4)}/${frameMediaId}/data`; await opfsService.saveFile(opfsPath, await frameBlob.arrayBuffer()); mediaMetadata.opfsPath = opfsPath; + void writeMediaSource(frameMediaId, frameBlob, fileName).catch((error) => { + getLogger().warn('[insertFreezeFrame] Failed to mirror frame to workspace', error); + }); await createMedia(mediaMetadata); await associateMediaWithProject(currentProjectId, frameMediaId); @@ -1067,6 +1143,7 @@ export function rippleTrimItem(id: string, handle: 'start' | 'end', trimDelta: n useKeyframesStore.getState()._removeKeyframesForItems(lockedRemoved); } applyTransitionRepairs(affected, lockedRemoved.length > 0 ? new Set(lockedRemoved) : undefined); + requestPostEditWarmForItems(affected); useTimelineSettingsStore.getState().markDirty(); }, { id, handle, trimDelta }); } @@ -1122,9 +1199,11 @@ export function rollingTrimItems(leftId: string, rightId: string, editPointDelta } // Repair transitions for both clips - applyTransitionRepairs(counterpartPair + const affectedIds = counterpartPair ? 
[leftId, rightId, counterpartPair.leftCounterpart.id, counterpartPair.rightCounterpart.id] - : [leftId, rightId]); + : [leftId, rightId]; + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { leftId, rightId, editPointDelta }); @@ -1172,7 +1251,9 @@ export function slipItem(id: string, slipDelta: number): void { }); } - applyTransitionRepairs(synchronizedItems.map((synchronizedItem) => synchronizedItem.id)); + const affectedIds = synchronizedItems.map((synchronizedItem) => synchronizedItem.id); + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, slipDelta }); @@ -1346,6 +1427,7 @@ export function slideItem( if (cpRightAdj) affectedIds.push(cpRightAdj.id); } applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, slideDelta, leftNeighborId, rightNeighborId }); diff --git a/src/features/timeline/stores/actions/linked-edit.test.ts b/src/features/timeline/stores/actions/linked-edit.test.ts index 0de97b5fa..7f7439c4f 100644 --- a/src/features/timeline/stores/actions/linked-edit.test.ts +++ b/src/features/timeline/stores/actions/linked-edit.test.ts @@ -62,6 +62,18 @@ describe('linked-edit helpers', () => { const items: TimelineItem[] = [ makeVideoItem({ id: 'video-2', from: 90, linkedGroupId: 'group-2' }), makeAudioItem({ id: 'audio-2', from: 90, linkedGroupId: 'group-2' }), + { + id: 'caption-2', + type: 'text', + trackId: 'caption-track', + from: 90, + durationInFrames: 60, + label: 'Caption', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-2', mediaId: 'media-1' }, + }, ]; expect( @@ -71,6 +83,7 @@ describe('linked-edit helpers', () => { ).toEqual([ { id: 'video-2', from: 60 }, { id: 'audio-2', from: 60 }, + { id: 'caption-2', from: 60 }, ]); }); @@ -118,4 +131,24 @@ describe('linked-edit helpers', () => { getMatchingSynchronizedLinkedCounterpartForEdit(items, 'video-left', 'audio-track', 'audio', true)?.id ).toBe('audio-left'); }); + + it('includes attached captions when expanding ids for deletion', () => { + const items: TimelineItem[] = [ + makeVideoItem({ id: 'video-1', linkedGroupId: undefined }), + { + id: 'caption-1', + type: 'text', + trackId: 'caption-track', + from: 0, + durationInFrames: 60, + label: 'Caption', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }, + ]; + + expect(expandIdsWithLinkedItems(items, ['video-1'], false)).toEqual(['video-1', 'caption-1']); + }); }); diff --git a/src/features/timeline/stores/actions/linked-edit.ts b/src/features/timeline/stores/actions/linked-edit.ts index 31a8f6fd0..356975ecf 100644 --- a/src/features/timeline/stores/actions/linked-edit.ts +++ b/src/features/timeline/stores/actions/linked-edit.ts @@ -1,6 +1,7 @@ import type { TimelineItem } from '@/types/timeline'; import { buildSynchronizedLinkedMoveUpdates, + expandItemIdsWithAttachedCaptions, expandSelectionWithLinkedItems, getLinkedItems, getMatchingSynchronizedLinkedCounterpart, @@ -19,10 +20,10 @@ export function expandIdsWithLinkedItems( linkedSelectionEnabled: boolean, ): string[] { if (!linkedSelectionEnabled) { - return Array.from(new Set(ids)); + return expandItemIdsWithAttachedCaptions(items, Array.from(new Set(ids))); } - return expandSelectionWithLinkedItems(items, ids); + 
return expandItemIdsWithAttachedCaptions(items, expandSelectionWithLinkedItems(items, ids)); } export function getLinkedItemsForEdit( @@ -78,8 +79,16 @@ export function buildLinkedLeftShiftUpdates( linkedSelectionEnabled: boolean, ): Array<{ id: string; from: number }> { if (!linkedSelectionEnabled) { + const shiftByItemId = new Map(baseShiftByItemId); + for (const [itemId, shiftAmount] of baseShiftByItemId) { + if (shiftAmount <= 0) continue; + for (const attachedId of expandItemIdsWithAttachedCaptions(items, [itemId])) { + shiftByItemId.set(attachedId, Math.max(shiftByItemId.get(attachedId) ?? 0, shiftAmount)); + } + } + return items.flatMap((item) => { - const shiftAmount = baseShiftByItemId.get(item.id) ?? 0; + const shiftAmount = shiftByItemId.get(item.id) ?? 0; return shiftAmount > 0 ? [{ id: item.id, from: item.from - shiftAmount }] : []; @@ -111,6 +120,13 @@ export function buildLinkedLeftShiftUpdates( } } + for (const [itemId, shiftAmount] of shiftByItemId) { + if (shiftAmount <= 0) continue; + for (const attachedId of expandItemIdsWithAttachedCaptions(items, [itemId])) { + shiftByItemId.set(attachedId, Math.max(shiftByItemId.get(attachedId) ?? 0, shiftAmount)); + } + } + return items.flatMap((item) => { const shiftAmount = shiftByItemId.get(item.id) ?? 0; return shiftAmount > 0 diff --git a/src/features/timeline/stores/timeline-persistence.ts b/src/features/timeline/stores/timeline-persistence.ts index 4bf32a806..c209b7bdc 100644 --- a/src/features/timeline/stores/timeline-persistence.ts +++ b/src/features/timeline/stores/timeline-persistence.ts @@ -768,7 +768,7 @@ export async function saveTimeline(projectId: string): Promise { }); } - // Save thumbnail to IndexedDB + // Save thumbnail to workspace storage thumbnailId = `project:${projectId}:cover`; await saveThumbnail({ id: thumbnailId, diff --git a/src/features/timeline/utils/linked-items.test.ts b/src/features/timeline/utils/linked-items.test.ts index 96a72f68b..90204bcce 100644 --- a/src/features/timeline/utils/linked-items.test.ts +++ b/src/features/timeline/utils/linked-items.test.ts @@ -1,11 +1,14 @@ import { describe, expect, it } from 'vitest'; import type { TimelineItem } from '@/types/timeline'; import { + expandItemIdsWithAttachedCaptions, buildLinkedMovePreviewUpdates, canLinkSelection, canLinkItems, expandSelectionWithLinkedItems, filterUnlockedItemIds, + getAttachedCaptionItemIds, + getLinkedAndAttachedItemIds, getLinkedItemIds, getLinkedSyncOffsetFrames, getUniqueLinkedItemAnchorIds, @@ -70,6 +73,46 @@ describe('linked items', () => { expect(expandSelectionWithLinkedItems(items, ['video-1', 'video-2'])).toEqual(['video-1', 'audio-1', 'video-2']); }); + it('finds caption-role text attached to a clip', () => { + const items = [ + makeItem({ id: 'video-1', type: 'video' }), + makeItem({ + id: 'caption-1', + type: 'text', + text: 'Hello', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }), + makeItem({ + id: 'manual-text', + type: 'text', + text: 'Manual', + color: '#fff', + }), + ]; + + expect(getAttachedCaptionItemIds(items, 'video-1')).toEqual(['caption-1']); + expect(expandItemIdsWithAttachedCaptions(items, ['video-1'])).toEqual(['video-1', 'caption-1']); + }); + + it('includes attached captions when expanding a linked clip pair', () => { + const items = [ + makeItem({ id: 'video-1', linkedGroupId: 'group-1', type: 'video' }), + makeItem({ id: 'audio-1', linkedGroupId: 'group-1', type: 'audio' }), + makeItem({ + id: 'caption-1', + type: 
'text', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }), + ]; + + expect(getLinkedAndAttachedItemIds(items, 'audio-1')).toEqual(['video-1', 'audio-1', 'caption-1']); + }); + it('dedupes linked groups down to one split anchor', () => { const items = [ makeItem({ id: 'comp-video-1', linkedGroupId: 'group-1', type: 'composition', compositionId: 'comp-1' }), diff --git a/src/features/timeline/utils/linked-items.ts b/src/features/timeline/utils/linked-items.ts index be42022ae..3d34d7567 100644 --- a/src/features/timeline/utils/linked-items.ts +++ b/src/features/timeline/utils/linked-items.ts @@ -30,6 +30,43 @@ export function getLinkedItemIds(items: TimelineItem[], itemId: string): string[ return getLinkedItems(items, itemId).map((item) => item.id); } +export function getAttachedCaptionItemIds(items: TimelineItem[], itemId: string): string[] { + const anchor = items.find((item) => item.id === itemId); + if (!anchor || anchor.type === 'text') { + return []; + } + + return items + .filter((item) => + item.type === 'text' + && (item.textRole === 'caption' || item.captionSource !== undefined) + && item.captionSource?.clipId === anchor.id + ) + .map((item) => item.id); +} + +export function expandItemIdsWithAttachedCaptions(items: TimelineItem[], itemIds: string[]): string[] { + const expandedIds = new Set(); + const captionIds = new Set(); + + for (const itemId of itemIds) { + expandedIds.add(itemId); + for (const captionId of getAttachedCaptionItemIds(items, itemId)) { + captionIds.add(captionId); + } + } + + for (const captionId of captionIds) { + expandedIds.add(captionId); + } + + return Array.from(expandedIds); +} + +export function getLinkedAndAttachedItemIds(items: TimelineItem[], itemId: string): string[] { + return expandItemIdsWithAttachedCaptions(items, getLinkedItemIds(items, itemId)); +} + export function filterUnlockedItemIds( items: TimelineItem[], tracks: Pick[], diff --git a/src/features/timeline/utils/zoom-anchor.test.ts b/src/features/timeline/utils/zoom-anchor.test.ts new file mode 100644 index 000000000..627a41baa --- /dev/null +++ b/src/features/timeline/utils/zoom-anchor.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from 'vitest'; + +import { + getAnchoredZoomScrollLeft, + getCursorZoomAnchor, + getPlayheadZoomAnchor, +} from './zoom-anchor'; + +describe('zoom-anchor', () => { + it('derives a cursor anchor from the visible cursor position', () => { + expect(getCursorZoomAnchor({ + currentZoomLevel: 1, + cursorScreenX: 180, + maxDurationSeconds: 10, + scrollLeft: 40, + })).toEqual({ + anchorScreenX: 180, + anchorTimeSeconds: 2.2, + }); + }); + + it('derives a playhead anchor from the current playhead frame', () => { + expect(getPlayheadZoomAnchor({ + currentFrame: 60, + currentZoomLevel: 1, + fps: 30, + maxDurationSeconds: 10, + scrollLeft: 50, + })).toEqual({ + anchorScreenX: 150, + anchorTimeSeconds: 2, + }); + }); + + it('computes scrollLeft so the playhead stays in place while zooming', () => { + const playheadAnchor = getPlayheadZoomAnchor({ + currentFrame: 60, + currentZoomLevel: 1, + fps: 30, + maxDurationSeconds: 10, + scrollLeft: 50, + }); + + expect(getAnchoredZoomScrollLeft({ + anchor: playheadAnchor, + maxDurationSeconds: 10, + nextZoomLevel: 2, + })).toBe(250); + }); +}); diff --git a/src/features/timeline/utils/zoom-anchor.ts b/src/features/timeline/utils/zoom-anchor.ts new file mode 100644 index 000000000..38563737d --- /dev/null +++ 
b/src/features/timeline/utils/zoom-anchor.ts @@ -0,0 +1,69 @@ +import { ZOOM_MAX, ZOOM_MIN } from '../constants'; + +const PIXELS_PER_SECOND_AT_100_PERCENT = 100; + +function zoomLevelToPixelsPerSecond(zoomLevel: number): number { + return zoomLevel * PIXELS_PER_SECOND_AT_100_PERCENT; +} + +function clampTimeSeconds(timeSeconds: number, maxDurationSeconds: number): number { + return Math.max(0, Math.min(timeSeconds, maxDurationSeconds)); +} + +export interface TimelineZoomAnchor { + anchorScreenX: number; + anchorTimeSeconds: number; +} + +export function getCursorZoomAnchor(params: { + currentZoomLevel: number; + cursorScreenX: number; + maxDurationSeconds: number; + scrollLeft: number; +}): TimelineZoomAnchor { + const currentPixelsPerSecond = zoomLevelToPixelsPerSecond(params.currentZoomLevel); + const anchorContentX = params.scrollLeft + params.cursorScreenX; + + return { + anchorScreenX: params.cursorScreenX, + anchorTimeSeconds: clampTimeSeconds( + anchorContentX / currentPixelsPerSecond, + params.maxDurationSeconds, + ), + }; +} + +export function getPlayheadZoomAnchor(params: { + currentFrame: number; + currentZoomLevel: number; + fps: number; + maxDurationSeconds: number; + scrollLeft: number; +}): TimelineZoomAnchor { + const safeFps = params.fps > 0 ? params.fps : 1; + const anchorTimeSeconds = clampTimeSeconds( + params.currentFrame / safeFps, + params.maxDurationSeconds, + ); + const anchorContentX = anchorTimeSeconds * zoomLevelToPixelsPerSecond(params.currentZoomLevel); + + return { + anchorScreenX: anchorContentX - params.scrollLeft, + anchorTimeSeconds, + }; +} + +export function getAnchoredZoomScrollLeft(params: { + anchor: TimelineZoomAnchor; + maxDurationSeconds: number; + nextZoomLevel: number; +}): number { + const clampedZoomLevel = Math.max(ZOOM_MIN, Math.min(ZOOM_MAX, params.nextZoomLevel)); + const anchorTimeSeconds = clampTimeSeconds( + params.anchor.anchorTimeSeconds, + params.maxDurationSeconds, + ); + const nextAnchorContentX = anchorTimeSeconds * zoomLevelToPixelsPerSecond(clampedZoomLevel); + + return Math.max(0, nextAnchorContentX - params.anchor.anchorScreenX); +} diff --git a/src/features/timeline/workers/filmstrip-extraction-worker.ts b/src/features/timeline/workers/filmstrip-extraction-worker.ts index 293851396..38f8e16ec 100644 --- a/src/features/timeline/workers/filmstrip-extraction-worker.ts +++ b/src/features/timeline/workers/filmstrip-extraction-worker.ts @@ -1,23 +1,15 @@ /** * Filmstrip Extraction Worker * - * Extracts video frames using mediabunny's CanvasSink and saves - * directly to OPFS. All heavy work happens in the worker. - * - * Storage structure: - * filmstrips/{mediaId}/ - * meta.json - { width, height, isComplete, frameCount } - * 0.jpg, 1.jpg, 2.jpg, ... (legacy caches may still include .webp) + * Extracts video frames using mediabunny's CanvasSink. + * All heavy decode and JPEG encode work happens in the worker; the + * main thread persists the resulting blobs into the workspace. 
*/

import { createMediabunnyInputSource } from '@/infrastructure/browser/mediabunny-input-source';
import type { ObjectUrlSourceMetadata } from '@/infrastructure/browser/object-url-registry';
-import { safeWrite } from '../utils/opfs-safe-write';
-
-const FILMSTRIP_DIR = 'filmstrips';
const IMAGE_FORMAT = 'image/jpeg';
const IMAGE_QUALITY = 0.7; // JPEG is substantially faster to encode for tiny thumbnails
-const FRAME_FILE_EXT = 'jpg';
const FRAME_RATE = 1; // 1fps for filmstrip thumbnails

// Message types
@@ -39,7 +31,7 @@ export interface ExtractRequest {
endIndex?: number; // End frame index (exclusive)
totalFrames?: number; // Total frames across all workers (for progress)
workerId?: number; // Worker identifier for debugging
- maxParallelSaves?: number; // Optional memory-pressure throttle from main thread
+ maxParallelSaves?: number; // Reserved for future worker-local throttling
}

export interface AbortRequest {
@@ -93,37 +85,15 @@ function getRequestIdFromMessage(data: unknown): string {
const loadMediabunny = () => import('mediabunny');

/**
- * Get or create OPFS directory for filmstrip storage
- */
-async function getFilmstripDir(mediaId: string): Promise<FileSystemDirectoryHandle> {
- const root = await navigator.storage.getDirectory();
- const filmstripRoot = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true });
- return filmstripRoot.getDirectoryHandle(mediaId, { create: true });
-}
-
-/**
- * Save a frame to OPFS
- */
-async function saveFrame(
- dir: FileSystemDirectoryHandle,
- index: number,
- blob: Blob
-): Promise<void> {
- const fileHandle = await dir.getFileHandle(`${index}.${FRAME_FILE_EXT}`, { create: true });
- const writable = await fileHandle.createWritable();
- await safeWrite(writable, blob);
-}
-
-/**
- * Extract frames and save directly to OPFS
+ * Extract frames and return encoded JPEG blobs to the main thread.
 */
async function extractAndSave(
request: ExtractRequest,
state: { aborted: boolean }
): Promise<void> {
const {
- requestId, mediaId, blobUrl, blob, sourceMetadata, duration, width, height, skipIndices, priorityIndices, targetIndices,
- startIndex, endIndex, totalFrames: totalFramesOverride, maxParallelSaves
+ requestId, blobUrl, blob, sourceMetadata, duration, width, height, skipIndices, priorityIndices, targetIndices,
+ startIndex, endIndex, totalFrames: totalFramesOverride
} = request;

// Calculate frame range - support both full extraction and chunked
@@ -183,9 +153,6 @@ async function extractAndSave(
return;
}

- // Get OPFS directory
- const dir = await getFilmstripDir(mediaId);
-
// Load mediabunny
const { Input, CanvasSink, ALL_FORMATS } = await loadMediabunny();
@@ -231,13 +198,11 @@
}

// Two parallel pipelines per frame:
- // 1. FAST: createImageBitmap → transfer to main thread (instant display, no encode)
- // 2. SLOW: convertToBlob (JPEG) → save to OPFS (persistence, runs in background)
+ // 1. FAST: createImageBitmap -> transfer to main thread (instant display, no encode)
+ // 2. SLOW: convertToBlob (JPEG) -> send blob to main thread for persistence
//
// Bitmaps are sent immediately on every decoded frame for instant UI updates.
- // JPEG encode + OPFS save runs concurrently, blobs reported when ready.
- const pendingSaves: Promise<void>[] = [];
- const MAX_PARALLEL_SAVES = Math.max(1, Math.min(6, maxParallelSaves ?? 4));
+ // JPEG encode runs concurrently, with blobs reported as soon as they are ready.
let pendingEncode: Promise<{ blob: Blob; frameIndex: number }> | null = null; let bitmapsSinceLastReport: Array<{ index: number; bitmap: ImageBitmap }> = []; @@ -245,15 +210,7 @@ async function extractAndSave( if (!pendingEncode) return; const { blob, frameIndex } = await pendingEncode; pendingEncode = null; - const savePromise = saveFrame(dir, frameIndex, blob).then(() => { - const idx = pendingSaves.indexOf(savePromise); - if (idx > -1) pendingSaves.splice(idx, 1); - savedSinceLastReport.push({ index: frameIndex, blob }); - }); - pendingSaves.push(savePromise); - if (pendingSaves.length >= MAX_PARALLEL_SAVES) { - await Promise.race(pendingSaves); - } + savedSinceLastReport.push({ index: frameIndex, blob }); }; for await (const wrapped of sink.canvasesAtTimestamps(timestampGenerator())) { @@ -281,7 +238,7 @@ async function extractAndSave( // Queue bitmap for immediate transfer to main thread (no JPEG encode needed) bitmapsSinceLastReport.push({ index: frameIndex, bitmap: displayBitmap }); - // Flush prior encode, then start JPEG encode in background for OPFS persistence + // Flush prior encode, then start JPEG encode in background for workspace persistence await flushPendingEncode(); const encodeCanvas = new OffscreenCanvas(encodeBitmap.width, encodeBitmap.height); const encodeCtx = encodeCanvas.getContext('2d')!; @@ -296,7 +253,7 @@ async function extractAndSave( frameListIndex++; // Send progress with bitmaps on every frame for instant display. - // savedFrames/savedIndices lag behind as JPEG encode + OPFS write complete. + // savedFrames/savedIndices lag behind as JPEG encode completes. const shouldReport = extractedCount <= 3 || extractedCount % 10 === 0 || bitmapsSinceLastReport.length > 0; if (shouldReport) { @@ -325,11 +282,6 @@ async function extractAndSave( // Flush the last pipelined encode await flushPendingEncode(); - // Wait for all pending saves to complete - if (pendingSaves.length > 0) { - await Promise.all(pendingSaves); - } - // Emit any saved frames that completed after the final progress report. 
if (savedSinceLastReport.length > 0) { const progress = Math.round((extractedCount / totalFrames) * 100); diff --git a/src/infrastructure/analysis/index.ts b/src/infrastructure/analysis/index.ts index d4267f818..02dd6138e 100644 --- a/src/infrastructure/analysis/index.ts +++ b/src/infrastructure/analysis/index.ts @@ -4,10 +4,38 @@ */ export { detectScenes, clearSceneCache } from '@/lib/analysis'; -export type { SceneCut, SceneDetectionProgress, VerificationModel } from '@/lib/analysis'; +export type { + SceneCut, + SceneDetectionProgress, + VerificationModel, +} from '@/lib/analysis'; export { getSceneVerificationModelLabel, getSceneVerificationModelOptions, } from '@/lib/analysis'; export { captionVideo, captionImage } from '@/lib/analysis'; export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/lib/analysis'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + sliceTranscript, + extractDominantColors, + extractDominantColorPhrase, + rgbToLab, + deltaE76, + deltaE2000, +} from '@/lib/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from '@/lib/analysis'; diff --git a/src/infrastructure/storage/cache-version.ts b/src/infrastructure/storage/cache-version.ts index 32b825fb1..f0dea0018 100644 --- a/src/infrastructure/storage/cache-version.ts +++ b/src/infrastructure/storage/cache-version.ts @@ -25,7 +25,7 @@ const VERSION_PREFIX = 'cache-version-'; const CACHE_VERSIONS = { filmstrip: 9, // OPFS filmstrip frames (v9: invalidate incorrect partial-complete prewarms) waveform: 3, // OPFS waveform data (v3: stereo interleaved L/R peaks) - thumbnail: 1, // IndexedDB thumbnails + thumbnail: 1, // Workspace-backed thumbnails media: 1, // OPFS media files } as const; diff --git a/src/infrastructure/storage/index.ts b/src/infrastructure/storage/index.ts index 46ae9bedc..ba07549c4 100644 --- a/src/infrastructure/storage/index.ts +++ b/src/infrastructure/storage/index.ts @@ -86,6 +86,44 @@ export { deleteTranscript, } from '@/infrastructure/storage/workspace-fs/transcripts'; +// AI captions (vision-language-model frame descriptions) +export { + getCaptions, + saveCaptions, + deleteCaptions, + saveCaptionThumbnail, + getCaptionThumbnailBlob, + probeCaptionThumbnail, + deleteCaptionThumbnails, + saveCaptionEmbeddings, + getCaptionEmbeddings, + getCaptionsEmbeddingsMeta, + deleteCaptionEmbeddings, + saveCaptionImageEmbeddings, + getCaptionImageEmbeddings, +} from '@/infrastructure/storage/workspace-fs/captions'; + +// Scene-detection results +export { + getScenes, + saveScenes, + deleteScenes, + type SavedScenes, +} from '@/infrastructure/storage/workspace-fs/scenes'; + +// Generic AI-output envelope (use these directly for new AI services) +export { + readAiOutput, + writeAiOutput, + deleteAiOutput, + listAiOutputs, + getMediaIdsWithAiOutput, + AI_OUTPUT_SCHEMA_VERSION, + type AiOutput, + type AiOutputKind, + type AiOutputPayloads, +} from '@/infrastructure/storage/workspace-fs/ai-outputs'; + // Orphan cache sweep export { sweepWorkspaceOrphans, diff --git a/src/infrastructure/storage/workspace-fs/README.template.md b/src/infrastructure/storage/workspace-fs/README.template.md index 816c33044..ed229aff6 100644 --- a/src/infrastructure/storage/workspace-fs/README.template.md +++ b/src/infrastructure/storage/workspace-fs/README.template.md @@ -1,6 +1,6 @@ # FreeCut 
Workspace -This folder is your FreeCut project workspace — the app's source of truth +This folder is your FreeCut project workspace - the app's source of truth for everything: projects, media metadata, thumbnails, waveforms, caches. Everything here is **plain files** you can `cat`, `grep`, and diff with @@ -10,30 +10,33 @@ normal tools. AI coding agents can read them directly without a browser. ``` ./ -├── README.md ← this file -├── .freecut-workspace.json ← marker + schema version -├── index.json ← fast project list -├── projects/ -│ └── / -│ ├── project.json ← timeline, settings, keyframes, markers, transitions -│ ├── thumbnail.jpg -│ └── media-links.json ← which media this project uses -├── media/ -│ └── / -│ ├── metadata.json ← codec, duration, resolution, etc. -│ ├── source. ← inline source file -│ ├── source.link.json ← OR a link descriptor to an external file -│ ├── thumbnail.jpg -│ └── cache/ -│ ├── filmstrip/ ← timeline thumbnails -│ ├── waveform/ ← audio peaks (binned binary) -│ ├── gif-frames/ ← pre-extracted GIF frames -│ ├── decoded-audio/ ← preview audio for non-browser codecs -│ └── transcript.json -└── content/ - └── / - ├── refs.json ← reference count - └── data. ← deduped blob +|-- README.md <- this file +|-- .freecut-workspace.json <- marker + schema version +|-- index.json <- fast project list +|-- projects/ +| `-- / +| |-- project.json <- timeline, settings, keyframes, markers, transitions +| |-- thumbnail.jpg +| `-- media-links.json <- which media this project uses +|-- media/ +| `-- / +| |-- metadata.json <- codec, duration, resolution, etc. +| |-- source. <- inline source file +| |-- source.link.json <- OR a link descriptor to an external file +| |-- thumbnail.jpg +| `-- cache/ +| |-- waveform/ <- audio peaks (binned binary) +| |-- gif-frames/ <- pre-extracted GIF frames +| |-- decoded-audio/ <- preview audio for non-browser codecs +| `-- transcript.json +|-- filmstrips/ +| `-- / <- timeline thumbnail cache +| |-- meta.json +| `-- 0.jpg, 1.jpg, ... +`-- content/ + `-- / + |-- refs.json <- reference count + `-- data. <- deduped blob ``` ## Safe to edit? @@ -41,10 +44,10 @@ normal tools. AI coding agents can read them directly without a browser. Everything except media source bytes is safe to inspect. Editing `project.json` externally works; FreeCut picks up changes on next load. -Binary caches (waveforms, decoded audio) are regeneratable — delete and -the app will rebuild them on demand. +Binary caches (waveforms, decoded audio, filmstrips) are regeneratable - +delete them and the app will rebuild them on demand. ## Moving the workspace -You can move this folder to a new location — the app just needs you to +You can move this folder to a new location - the app just needs you to re-pick it via the "Reconnect" prompt on next launch. 
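The storage work that follows replaces per-kind ad-hoc persistence with a single envelope API (`readAiOutput`/`writeAiOutput`). As an orientation aid (not part of the patch), here is a minimal sketch of the intended round-trip; the `whisper-small` and `q8` literals are illustrative placeholders, and a workspace root is assumed to be connected already:

```ts
// Sketch only (not in the patch): round-tripping a transcript envelope.
// Assumes the workspace gate has already called setWorkspaceRoot; the
// 'whisper-small' / 'q8' literals are placeholder identifiers.
import { readAiOutput, writeAiOutput } from '@/infrastructure/storage/workspace-fs/ai-outputs';

async function persistTranscript(mediaId: string): Promise<void> {
  // First write stamps createdAt; later writes preserve it and bump updatedAt.
  await writeAiOutput({
    mediaId,
    kind: 'transcript',
    service: 'whisper',
    model: 'whisper-small',
    params: { quantization: 'q8' },
    data: {
      quantization: 'q8',
      modelVariant: 'whisper-small',
      text: 'hello world',
      segments: [{ text: 'hello world', start: 0, end: 1.2 }],
    },
  });

  // Passing kind 'transcript' narrows the envelope's data to TranscriptPayload.
  const envelope = await readAiOutput(mediaId, 'transcript');
  console.log(envelope?.model, envelope?.data.segments.length);
}
```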
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts new file mode 100644 index 000000000..5de646e47 --- /dev/null +++ b/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts @@ -0,0 +1,146 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +vi.mock('@/shared/logging/logger', () => ({ + createLogger: () => ({ + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + event: vi.fn(), + startEvent: () => ({ set: vi.fn(), merge: vi.fn(), success: vi.fn(), failure: vi.fn() }), + child: vi.fn(), + setLevel: vi.fn(), + }), + createOperationId: () => 'op-test', +})); + +import { + deleteAiOutput, + getMediaIdsWithAiOutput, + listAiOutputs, + readAiOutput, + writeAiOutput, +} from './io'; +import { setWorkspaceRoot } from '../root'; +import { asHandle, createRoot } from '../__tests__/in-memory-handle'; + +afterEach(() => setWorkspaceRoot(null)); + +describe('workspace-fs ai-outputs', () => { + it('round-trips a captions envelope', async () => { + setWorkspaceRoot(asHandle(createRoot())); + const written = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + params: { sampleIntervalSec: 2 }, + data: { sampleIntervalSec: 2, captions: [{ timeSec: 0, text: 'hello' }] }, + }); + expect(written.createdAt).toBeGreaterThan(0); + expect(written.updatedAt).toBeGreaterThanOrEqual(written.createdAt); + + const loaded = await readAiOutput('m1', 'captions'); + expect(loaded?.data.captions[0]?.text).toBe('hello'); + expect(loaded?.service).toBe('lfm-captioning'); + }); + + it('preserves createdAt across updates', async () => { + setWorkspaceRoot(asHandle(createRoot())); + const first = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [{ timeSec: 0, text: 'v1' }] }, + }); + + await new Promise((r) => setTimeout(r, 2)); + const second = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [{ timeSec: 0, text: 'v2' }] }, + }); + + expect(second.createdAt).toBe(first.createdAt); + expect(second.updatedAt).toBeGreaterThanOrEqual(first.updatedAt); + }); + + it('readAiOutput returns undefined when missing', async () => { + setWorkspaceRoot(asHandle(createRoot())); + expect(await readAiOutput('missing', 'captions')).toBeUndefined(); + }); + + it('deleteAiOutput removes the file', async () => { + setWorkspaceRoot(asHandle(createRoot())); + await writeAiOutput({ + mediaId: 'm1', + kind: 'scenes', + service: 'scene-detect', + model: 'histogram', + data: { + method: 'histogram', + sampleIntervalMs: 250, + fps: 30, + cuts: [], + }, + }); + await deleteAiOutput('m1', 'scenes'); + expect(await readAiOutput('m1', 'scenes')).toBeUndefined(); + }); + + it('listAiOutputs returns saved kinds', async () => { + setWorkspaceRoot(asHandle(createRoot())); + await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [] }, + }); + await writeAiOutput({ + mediaId: 'm1', + kind: 'scenes', + service: 'scene-detect', + model: 'histogram', + data: { + method: 'histogram', + sampleIntervalMs: 250, + fps: 30, + cuts: [], + }, + }); + + const kinds = await listAiOutputs('m1'); + expect(new Set(kinds)).toEqual(new Set(['captions', 'scenes'])); + }); + + it('getMediaIdsWithAiOutput filters to ids with output 
present', async () => {
+ setWorkspaceRoot(asHandle(createRoot()));
+ await writeAiOutput({
+ mediaId: 'm1',
+ kind: 'captions',
+ service: 'lfm-captioning',
+ model: 'lfm-2.5-vl',
+ data: { captions: [] },
+ });
+ await writeAiOutput({
+ mediaId: 'm3',
+ kind: 'captions',
+ service: 'lfm-captioning',
+ model: 'lfm-2.5-vl',
+ data: { captions: [] },
+ });
+
+ const ids = await getMediaIdsWithAiOutput(['m1', 'm2', 'm3'], 'captions');
+ expect(ids).toEqual(new Set(['m1', 'm3']));
+ });
+
+ it('getMediaIdsWithAiOutput short-circuits on empty input', async () => {
+ setWorkspaceRoot(null);
+ const ids = await getMediaIdsWithAiOutput([], 'captions');
+ expect(ids.size).toBe(0);
+ });
+});
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts
new file mode 100644
index 000000000..06b683d46
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts
@@ -0,0 +1,21 @@
+export type {
+ AiOutput,
+ AiOutputKind,
+ AiOutputPayloads,
+ TranscriptPayload,
+ CaptionsPayload,
+ ScenesPayload,
+ SceneCutPayload,
+} from './types';
+export {
+ AI_OUTPUT_SCHEMA_VERSION,
+ transcriptFromLegacy,
+ transcriptToLegacy,
+} from './types';
+export {
+ readAiOutput,
+ writeAiOutput,
+ deleteAiOutput,
+ listAiOutputs,
+ getMediaIdsWithAiOutput,
+} from './io';
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts
new file mode 100644
index 000000000..3e3e48633
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts
@@ -0,0 +1,134 @@
+/**
+ * Generic CRUD for AI output envelopes under `media/{id}/cache/ai/{kind}.json`.
+ *
+ * Every per-kind wrapper (transcripts, captions, scenes…) delegates here so
+ * the on-disk layout, envelope shape, and error handling stay uniform.
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+
+import { requireWorkspaceRoot } from '../root';
+import {
+ readJson,
+ removeEntry,
+ writeJsonAtomic,
+ listDirectory,
+} from '../fs-primitives';
+import { aiOutputPath, aiOutputsDir } from '../paths';
+
+import {
+ AI_OUTPUT_SCHEMA_VERSION,
+ type AiOutput,
+ type AiOutputKind,
+ type AiOutputPayloads,
+} from './types';
+
+const logger = createLogger('WorkspaceFS:AiOutputs');
+
+export async function readAiOutput<K extends AiOutputKind>(
+ mediaId: string,
+ kind: K,
+): Promise<AiOutput<K> | undefined> {
+ const root = requireWorkspaceRoot();
+ try {
+ const result = await readJson<AiOutput<K>>(root, aiOutputPath(mediaId, kind));
+ return result ?? undefined;
+ } catch (error) {
+ logger.error(`readAiOutput(${mediaId}, ${kind}) failed`, error);
+ throw new Error(`Failed to load AI output ${kind} for ${mediaId}`);
+ }
+}
+
+interface WriteInput<K extends AiOutputKind> {
+ mediaId: string;
+ kind: K;
+ service: string;
+ model: string;
+ params?: Record<string, unknown>;
+ data: AiOutputPayloads[K];
+}
+
+/**
+ * Write an envelope atomically. Sets `createdAt` on first write and updates
+ * `updatedAt` every time. Returns the persisted envelope.
+ */
+export async function writeAiOutput<K extends AiOutputKind>(
+ input: WriteInput<K>,
+): Promise<AiOutput<K>> {
+ const root = requireWorkspaceRoot();
+ const now = Date.now();
+ const existing = await readJson<AiOutput<K>>(root, aiOutputPath(input.mediaId, input.kind));
+
+ const envelope: AiOutput<K> = {
+ schemaVersion: AI_OUTPUT_SCHEMA_VERSION,
+ kind: input.kind,
+ mediaId: input.mediaId,
+ service: input.service,
+ model: input.model,
+ params: input.params ?? {},
+ createdAt: existing?.createdAt ?? now,
+ updatedAt: now,
+ data: input.data,
+ };
+
+ try {
+ await writeJsonAtomic(root, aiOutputPath(input.mediaId, input.kind), envelope);
+ return envelope;
+ } catch (error) {
+ logger.error(`writeAiOutput(${input.mediaId}, ${input.kind}) failed`, error);
+ throw new Error(`Failed to save AI output ${input.kind} for ${input.mediaId}`);
+ }
+}
+
+export async function deleteAiOutput(
+ mediaId: string,
+ kind: AiOutputKind,
+): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, aiOutputPath(mediaId, kind));
+ } catch (error) {
+ logger.error(`deleteAiOutput(${mediaId}, ${kind}) failed`, error);
+ throw new Error(`Failed to delete AI output ${kind} for ${mediaId}`);
+ }
+}
+
+/**
+ * List every AI output kind present for `mediaId`. Returns the `kind` stems
+ * (no extension). Used by cleanup sweeps and debug UIs.
+ */
+export async function listAiOutputs(mediaId: string): Promise<AiOutputKind[]> {
+ const root = requireWorkspaceRoot();
+ try {
+ const entries = await listDirectory(root, aiOutputsDir(mediaId));
+ return entries
+ .filter((entry) => entry.kind === 'file' && entry.name.endsWith('.json'))
+ .map((entry) => entry.name.slice(0, -'.json'.length) as AiOutputKind);
+ } catch (error) {
+ logger.warn(`listAiOutputs(${mediaId}) failed`, error);
+ return [];
+ }
+}
+
+/**
+ * Bulk existence probe. Returns the subset of `mediaIds` that have a saved
+ * output of `kind`. Concurrent reads — callers should pre-batch by kind.
+ */
+export async function getMediaIdsWithAiOutput(
+ mediaIds: string[],
+ kind: AiOutputKind,
+): Promise<Set<string>> {
+ if (mediaIds.length === 0) return new Set();
+ const root = requireWorkspaceRoot();
+ const ready = new Set<string>();
+ const results = await Promise.all(
+ mediaIds.map(async (id) => {
+ const env = await readJson<AiOutput>(root, aiOutputPath(id, kind));
+ return env ? id : null;
+ }),
+ );
+ for (const id of results) {
+ if (id) ready.add(id);
+ }
+ return ready;
+}
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts
new file mode 100644
index 000000000..00411af3e
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts
@@ -0,0 +1,143 @@
+/**
+ * Shared envelope for every AI-derived analysis output stored under
+ * `media/{id}/cache/ai/{kind}.json`.
+ *
+ * One file per `AiOutputKind`. Envelope fields are identical across kinds so
+ * invalidation logic ("is this transcript from the same model we use today?")
+ * works uniformly. Service-specific data goes inside `data`.
+ */
+
+import type { MediaCaption } from '@/infrastructure/analysis';
+import type { MediaTranscript, MediaTranscriptModel, MediaTranscriptQuantization } from '@/types/storage';
+
+/**
+ * Registry of AI output kinds. Adding a new AI service means:
+ * 1. Add its name here.
+ * 2. Add its payload type to `AiOutputPayloads` below.
+ * 3. (Optional) Add a thin wrapper in `workspace-fs/` that calls
+ * `readAiOutput/writeAiOutput` with that kind.
+ */
+export type AiOutputKind =
+ | 'transcript'
+ | 'captions'
+ | 'scenes';
+
+/**
+ * Typed payload per kind. Matches the `data` field on `AiOutput`.
+ * New kinds must be registered here so the storage API stays strongly typed.
+ */
+export interface AiOutputPayloads {
+ transcript: TranscriptPayload;
+ captions: CaptionsPayload;
+ scenes: ScenesPayload;
+}
+
+/**
+ * Current schema version for the envelope itself. Bump when the envelope
+ * shape changes (not when a payload changes — that's the payload's concern).
+ */
+export const AI_OUTPUT_SCHEMA_VERSION = 1;
+
+export interface AiOutput<K extends AiOutputKind = AiOutputKind> {
+ schemaVersion: typeof AI_OUTPUT_SCHEMA_VERSION;
+ kind: K;
+ mediaId: string;
+ /** Stable service identifier, e.g. `"whisper-wasm"`, `"lfm-captioning"`. */
+ service: string;
+ /** Model id/version, e.g. `"whisper-small"`, `"lfm-2.5-vl"`. */
+ model: string;
+ /** Service-specific inputs that affect the output (quantization, threshold, sample interval). */
+ params: Record<string, unknown>;
+ createdAt: number;
+ updatedAt: number;
+ data: AiOutputPayloads[K];
+}
+
+/* ───────────────── Payload shapes ───────────────── */
+
+export interface TranscriptPayload {
+ language?: string;
+ quantization: MediaTranscriptQuantization;
+ modelVariant: MediaTranscriptModel;
+ text: string;
+ segments: Array<{ text: string; start: number; end: number }>;
+}
+
+export type CaptionsPayload = {
+ sampleIntervalSec?: number;
+ /**
+ * Identifier of the text embedding model whose vectors live in the
+ * companion `captions-embeddings.bin` file. Absence means embeddings
+ * haven't been computed yet (keyword search still works).
+ */
+ embeddingModel?: string;
+ /** Dimension of each text embedding vector, e.g. 384 for all-MiniLM-L6-v2. */
+ embeddingDim?: number;
+ /**
+ * Identifier of the image (CLIP) embedding model whose vectors live
+ * in `captions-image-embeddings.bin`. Independent of the text model;
+ * present only when thumbnails have been visually indexed.
+ */
+ imageEmbeddingModel?: string;
+ /** Dimension of each image embedding vector, e.g. 512 for CLIP base. */
+ imageEmbeddingDim?: number;
+ captions: MediaCaption[];
+};
+
+export interface SceneCutPayload {
+ frame: number;
+ time: number;
+ /** Service-defined motion metadata (histogram distance, flow magnitude, etc.). */
+ motion: unknown;
+ verified?: boolean;
+}
+
+export interface ScenesPayload {
+ method: 'histogram' | 'optical-flow';
+ sampleIntervalMs: number;
+ verificationModel?: string;
+ fps: number;
+ cuts: SceneCutPayload[];
+}
+
+/* ───────────────── Conversions ───────────────── */
+
+/**
+ * Adapter: build a transcript envelope from the legacy {@link MediaTranscript}
+ * record shape. Keeps callers that already construct `MediaTranscript` working
+ * unchanged during the migration.
+ */
+export function transcriptFromLegacy(record: MediaTranscript): AiOutput<'transcript'> {
+ return {
+ schemaVersion: AI_OUTPUT_SCHEMA_VERSION,
+ kind: 'transcript',
+ mediaId: record.mediaId,
+ service: 'whisper',
+ model: record.model,
+ params: { quantization: record.quantization, language: record.language },
+ createdAt: record.createdAt,
+ updatedAt: record.updatedAt ?? record.createdAt,
+ data: {
+ language: record.language,
+ quantization: record.quantization,
+ modelVariant: record.model,
+ text: record.text,
+ segments: record.segments,
+ },
+ };
+}
+
+/** Inverse of {@link transcriptFromLegacy}.
*/
+export function transcriptToLegacy(envelope: AiOutput<'transcript'>): MediaTranscript {
+ return {
+ id: envelope.mediaId,
+ mediaId: envelope.mediaId,
+ model: envelope.data.modelVariant,
+ language: envelope.data.language,
+ quantization: envelope.data.quantization,
+ text: envelope.data.text,
+ segments: envelope.data.segments,
+ createdAt: envelope.createdAt,
+ updatedAt: envelope.updatedAt,
+ };
+}
diff --git a/src/infrastructure/storage/workspace-fs/captions.ts b/src/infrastructure/storage/workspace-fs/captions.ts
new file mode 100644
index 000000000..25821876a
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/captions.ts
@@ -0,0 +1,321 @@
+/**
+ * Per-media AI captions (vision-language-model frame descriptions).
+ *
+ * Stored at `media/{mediaId}/cache/ai/captions.json` as an {@link AiOutput}
+ * envelope. A denormalized copy lives on `MediaMetadata.aiCaptions` as a
+ * read-path convenience for UI consumers — writers must keep them in sync.
+ */
+
+import type { MediaCaption } from '@/infrastructure/analysis';
+import { createLogger } from '@/shared/logging/logger';
+
+import { readAiOutput, writeAiOutput, deleteAiOutput } from './ai-outputs';
+import { readArrayBuffer, readBlob, removeEntry, writeBlob } from './fs-primitives';
+import {
+ captionEmbeddingsPath,
+ captionImageEmbeddingsPath,
+ captionThumbPath,
+ captionThumbRelPath,
+ captionThumbsDir,
+} from './paths';
+import { requireWorkspaceRoot } from './root';
+
+const logger = createLogger('WorkspaceFS:Captions');
+
+interface SaveCaptionsInput {
+ mediaId: string;
+ captions: MediaCaption[];
+ /** Stable provider id, e.g. `"lfm-captioning"`. */
+ service: string;
+ /** Model id/version reported by the provider, e.g. `"lfm-2.5-vl"`. */
+ model: string;
+ /** Sample interval used at generation time — kept for invalidation. */
+ sampleIntervalSec?: number;
+ /** Text-embedding model id whose vectors are stored in the companion `.bin`. */
+ embeddingModel?: string;
+ /** Dimension of each text embedding vector. */
+ embeddingDim?: number;
+ /** CLIP image-embedding model id (separate bin). */
+ imageEmbeddingModel?: string;
+ /** Dimension of each image embedding vector. */
+ imageEmbeddingDim?: number;
+}
+
+export async function getCaptions(
+ mediaId: string,
+): Promise<MediaCaption[] | undefined> {
+ try {
+ const envelope = await readAiOutput(mediaId, 'captions');
+ return envelope?.data.captions;
+ } catch (error) {
+ logger.error(`getCaptions(${mediaId}) failed`, error);
+ throw new Error(`Failed to load captions: ${mediaId}`);
+ }
+}
+
+export async function saveCaptions(input: SaveCaptionsInput): Promise<MediaCaption[]> {
+ try {
+ const written = await writeAiOutput({
+ mediaId: input.mediaId,
+ kind: 'captions',
+ service: input.service,
+ model: input.model,
+ params: input.sampleIntervalSec !== undefined ? { sampleIntervalSec: input.sampleIntervalSec } : {},
+ data: {
+ sampleIntervalSec: input.sampleIntervalSec,
+ embeddingModel: input.embeddingModel,
+ embeddingDim: input.embeddingDim,
+ imageEmbeddingModel: input.imageEmbeddingModel,
+ imageEmbeddingDim: input.imageEmbeddingDim,
+ captions: input.captions,
+ },
+ });
+ return written.data.captions;
+ } catch (error) {
+ logger.error(`saveCaptions(${input.mediaId}) failed`, error);
+ throw new Error(`Failed to save captions: ${input.mediaId}`);
+ }
+}
+
+/**
+ * Read the raw embedding metadata saved alongside captions — both text
+ * and image model identifiers so ranking can decide whether each bin is
+ * safe to load back in.
+ */
+export async function getCaptionsEmbeddingsMeta(
+ mediaId: string,
+): Promise<{
+ embeddingModel?: string;
+ embeddingDim?: number;
+ imageEmbeddingModel?: string;
+ imageEmbeddingDim?: number;
+} | null> {
+ const envelope = await readAiOutput(mediaId, 'captions');
+ if (!envelope) return null;
+ return {
+ embeddingModel: envelope.data.embeddingModel,
+ embeddingDim: envelope.data.embeddingDim,
+ imageEmbeddingModel: envelope.data.imageEmbeddingModel,
+ imageEmbeddingDim: envelope.data.imageEmbeddingDim,
+ };
+}
+
+/**
+ * Persist caption embeddings as a contiguous `Float32Array`. Layout is
+ * `captionCount * embeddingDim` floats, stored in caption index order.
+ * The companion `captions.json` records {@link embeddingModel} and
+ * {@link embeddingDim} so a later read can detect model-drift before
+ * trusting the payload.
+ */
+export async function saveCaptionEmbeddings(
+ mediaId: string,
+ vectors: Float32Array[],
+ embeddingDim: number,
+): Promise<void> {
+ if (vectors.length === 0) return;
+ const root = requireWorkspaceRoot();
+ const packed = new Float32Array(vectors.length * embeddingDim);
+ vectors.forEach((vector, index) => {
+ if (vector.length !== embeddingDim) {
+ throw new Error(
+ `Embedding dim mismatch at index ${index}: got ${vector.length}, expected ${embeddingDim}`,
+ );
+ }
+ packed.set(vector, index * embeddingDim);
+ });
+ try {
+ await writeBlob(root, captionEmbeddingsPath(mediaId), packed.buffer);
+ } catch (error) {
+ logger.error(`saveCaptionEmbeddings(${mediaId}) failed`, error);
+ throw new Error(`Failed to save caption embeddings: ${mediaId}`);
+ }
+}
+
+/**
+ * Load caption embeddings back into an array of `Float32Array`s. Returns
+ * `null` when no `.bin` exists (pre-feature captions) or when the saved
+ * vector count doesn't match `expectedCount` (captions changed under our
+ * feet and the bin is stale).
+ */
+export async function getCaptionEmbeddings(
+ mediaId: string,
+ embeddingDim: number,
+ expectedCount: number,
+): Promise<Float32Array[] | null> {
+ if (expectedCount === 0) return [];
+ const root = requireWorkspaceRoot();
+ try {
+ const buffer = await readArrayBuffer(root, captionEmbeddingsPath(mediaId));
+ if (!buffer) return null;
+ const expectedFloats = expectedCount * embeddingDim;
+ const got = buffer.byteLength / Float32Array.BYTES_PER_ELEMENT;
+ if (got !== expectedFloats) {
+ logger.warn(
+ `getCaptionEmbeddings(${mediaId}): bin has ${got} floats, expected ${expectedFloats} — treating as stale`,
+ );
+ return null;
+ }
+ const packed = new Float32Array(buffer);
+ const vectors: Float32Array[] = [];
+ for (let i = 0; i < expectedCount; i += 1) {
+ vectors.push(packed.slice(i * embeddingDim, (i + 1) * embeddingDim));
+ }
+ return vectors;
+ } catch (error) {
+ logger.warn(`getCaptionEmbeddings(${mediaId}) failed`, error);
+ return null;
+ }
+}
+
+export async function deleteCaptionEmbeddings(mediaId: string): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, captionEmbeddingsPath(mediaId));
+ } catch (error) {
+ logger.warn(`deleteCaptionEmbeddings(${mediaId}) failed`, error);
+ }
+ try {
+ await removeEntry(root, captionImageEmbeddingsPath(mediaId));
+ } catch (error) {
+ logger.warn(`deleteCaptionImageEmbeddings(${mediaId}) failed`, error);
+ }
+}
+
+/**
+ * Persist per-caption CLIP image embeddings. Same layout as text
+ * embeddings — `captionCount * embeddingDim` packed floats in caption
+ * order. Safe to call independently of {@link saveCaptionEmbeddings};
+ * either bin can exist without the other.
+ */
+export async function saveCaptionImageEmbeddings(
+ mediaId: string,
+ vectors: Float32Array[],
+ embeddingDim: number,
+): Promise<void> {
+ if (vectors.length === 0) return;
+ const root = requireWorkspaceRoot();
+ const packed = new Float32Array(vectors.length * embeddingDim);
+ vectors.forEach((vector, index) => {
+ if (vector.length !== embeddingDim) {
+ throw new Error(
+ `Image embedding dim mismatch at index ${index}: got ${vector.length}, expected ${embeddingDim}`,
+ );
+ }
+ packed.set(vector, index * embeddingDim);
+ });
+ try {
+ await writeBlob(root, captionImageEmbeddingsPath(mediaId), packed.buffer);
+ } catch (error) {
+ logger.error(`saveCaptionImageEmbeddings(${mediaId}) failed`, error);
+ throw new Error(`Failed to save caption image embeddings: ${mediaId}`);
+ }
+}
+
+export async function getCaptionImageEmbeddings(
+ mediaId: string,
+ embeddingDim: number,
+ expectedCount: number,
+): Promise<Float32Array[] | null> {
+ if (expectedCount === 0) return [];
+ const root = requireWorkspaceRoot();
+ try {
+ const buffer = await readArrayBuffer(root, captionImageEmbeddingsPath(mediaId));
+ if (!buffer) return null;
+ const expectedFloats = expectedCount * embeddingDim;
+ const got = buffer.byteLength / Float32Array.BYTES_PER_ELEMENT;
+ if (got !== expectedFloats) {
+ logger.warn(
+ `getCaptionImageEmbeddings(${mediaId}): bin has ${got} floats, expected ${expectedFloats} — treating as stale`,
+ );
+ return null;
+ }
+ const packed = new Float32Array(buffer);
+ const vectors: Float32Array[] = [];
+ for (let i = 0; i < expectedCount; i += 1) {
+ vectors.push(packed.slice(i * embeddingDim, (i + 1) * embeddingDim));
+ }
+ return vectors;
+ } catch (error) {
+ logger.warn(`getCaptionImageEmbeddings(${mediaId}) failed`, error);
+ return null;
+ }
+}
+
+export async function deleteCaptions(mediaId: string): Promise<void> {
+ try {
+ await deleteAiOutput(mediaId, 'captions');
+ await deleteCaptionThumbnails(mediaId);
+ await deleteCaptionEmbeddings(mediaId);
+ } catch (error) {
+ logger.error(`deleteCaptions(${mediaId}) failed`, error);
+ throw new Error(`Failed to delete captions: ${mediaId}`);
+ }
+}
+
+/**
+ * Persist a single caption thumbnail JPEG. Returns the workspace-relative
+ * path to stash on the corresponding `MediaCaption.thumbRelPath` so the
+ * Scene Browser can load the blob back on demand.
+ */
+export async function saveCaptionThumbnail(
+ mediaId: string,
+ index: number,
+ blob: Blob,
+): Promise<string> {
+ const root = requireWorkspaceRoot();
+ try {
+ await writeBlob(root, captionThumbPath(mediaId, index), blob);
+ return captionThumbRelPath(mediaId, index);
+ } catch (error) {
+ logger.error(`saveCaptionThumbnail(${mediaId}, ${index}) failed`, error);
+ throw new Error(`Failed to save caption thumbnail: ${mediaId}#${index}`);
+ }
+}
+
+/**
+ * Load a previously-saved caption thumbnail by its workspace-relative path.
+ * Returns `null` when the file is missing (captions from before the feature
+ * landed, or the directory was pruned).
+ */
+export async function getCaptionThumbnailBlob(
+ relPath: string,
+): Promise<Blob | null> {
+ const root = requireWorkspaceRoot();
+ const segments = relPath.split('/').filter(Boolean);
+ if (segments.length === 0) return null;
+ try {
+ return await readBlob(root, segments);
+ } catch (error) {
+ logger.warn(`getCaptionThumbnailBlob(${relPath}) failed`, error);
+ return null;
+ }
+}
+
+/**
+ * Probe the conventional caption thumbnail path for a (mediaId, captionIndex)
+ * pair.
Returns the workspace-relative path when the file exists so the
+ * caller can reuse it without regenerating — useful for captions whose
+ * `thumbRelPath` pointer was dropped across a reload but whose JPEG is
+ * still on disk.
+ */
+export async function probeCaptionThumbnail(
+ mediaId: string,
+ captionIndex: number,
+): Promise<string | null> {
+ const relPath = captionThumbRelPath(mediaId, captionIndex);
+ const blob = await getCaptionThumbnailBlob(relPath);
+ return blob ? relPath : null;
+}
+
+/**
+ * Remove the `captions-thumbs` directory for a media item. No-op when the
+ * directory is absent; never throws — thumbnail cleanup is opportunistic.
+ */
+export async function deleteCaptionThumbnails(mediaId: string): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, captionThumbsDir(mediaId), { recursive: true });
+ } catch (error) {
+ logger.warn(`deleteCaptionThumbnails(${mediaId}) failed`, error);
+ }
+}
diff --git a/src/infrastructure/storage/workspace-fs/paths.ts b/src/infrastructure/storage/workspace-fs/paths.ts
index 0f2553ca5..f60584d9e 100644
--- a/src/infrastructure/storage/workspace-fs/paths.ts
+++ b/src/infrastructure/storage/workspace-fs/paths.ts
@@ -28,7 +28,11 @@
 * │ ├── waveform/{meta.json,bin-N.bin}
 * │ ├── gif-frames/{meta.json,frame-N.png}
 * │ ├── decoded-audio/{meta.json,left-N.bin,right-N.bin}
- * │ └── transcript.json
+ * │ └── ai/
+ * │ ├── transcript.json
+ * │ ├── captions.json
+ * │ ├── scenes.json
+ * │ └── {kind}.json # new AI outputs go here, one file per kind
 * └── content/
 * └── {hash[0:2]}/{hash}/
 * ├── refs.json
@@ -67,11 +71,18 @@ export const MEDIA_THUMBNAIL_FILENAME = 'thumbnail.jpg';
export const MEDIA_SOURCE_LINK_FILENAME = 'source.link.json';
export const MEDIA_CACHE_DIR = 'cache';

-export const CACHE_FILMSTRIP_DIR = 'filmstrip';
export const CACHE_WAVEFORM_DIR = 'waveform';
export const CACHE_GIF_FRAMES_DIR = 'gif-frames';
export const CACHE_DECODED_AUDIO_DIR = 'decoded-audio';
-export const CACHE_TRANSCRIPT_FILENAME = 'transcript.json';
+export const CACHE_AI_DIR = 'ai';
+/** Per-caption thumbnail JPEGs captured alongside LFM caption generation. */
+export const CACHE_CAPTION_THUMBS_DIR = 'captions-thumbs';
+/**
+ * Legacy path for transcripts — was `cache/transcript.json` before AI outputs
+ * were consolidated under `cache/ai/`. Readers fall back to this on miss; a
+ * subsequent save rewrites to the new path.
+ */
+export const CACHE_TRANSCRIPT_FILENAME_LEGACY = 'transcript.json';
export const CACHE_META_FILENAME = 'meta.json';

export const CONTENT_REFS_FILENAME = 'refs.json';
@@ -180,14 +191,6 @@ export function mediaCacheDir(id: string): string[] {
return [...mediaDir(id), MEDIA_CACHE_DIR];
}

-export function filmstripDir(mediaId: string): string[] {
- return [...mediaCacheDir(mediaId), CACHE_FILMSTRIP_DIR];
-}
-
-export function filmstripFramePath(mediaId: string, frameIndex: number): string[] {
- return [...filmstripDir(mediaId), `frame-${frameIndex}.jpg`];
-}
-
export function waveformDir(mediaId: string): string[] {
return [...mediaCacheDir(mediaId), CACHE_WAVEFORM_DIR];
}
@@ -216,8 +219,65 @@ export function decodedAudioBinPath(
return [...decodedAudioDir(mediaId), `${channel}-${binIndex}.bin`];
}

-export function transcriptPath(mediaId: string): string[] {
- return [...mediaCacheDir(mediaId), CACHE_TRANSCRIPT_FILENAME];
+/**
+ * Segments for `media/{id}/cache/ai/` — home for AI-derived analysis outputs
+ * (transcripts, captions, scene cuts, etc.). One file per `AiOutputKind`.
+ */ +export function aiOutputsDir(mediaId: string): string[] { + return [...mediaCacheDir(mediaId), CACHE_AI_DIR]; +} + +/** + * Segments for `media/{id}/cache/ai/{kind}.json`. The caller owns the `kind` + * enum (see `ai-outputs/types.ts`) — this helper only does path assembly. + */ +export function aiOutputPath(mediaId: string, kind: string): string[] { + return [...aiOutputsDir(mediaId), `${kind}.json`]; +} + +/** Segments for `media/{id}/cache/ai/captions-thumbs/`. */ +export function captionThumbsDir(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), CACHE_CAPTION_THUMBS_DIR]; +} + +/** Segments for `media/{id}/cache/ai/captions-thumbs/{index}.jpg`. */ +export function captionThumbPath(mediaId: string, index: number): string[] { + return [...captionThumbsDir(mediaId), `${index}.jpg`]; +} + +/** + * Segments for `media/{id}/cache/ai/captions-embeddings.bin`. Stored as a + * contiguous `Float32Array` so 384-dim * N-caption embeddings stay compact + * (e.g. 500 captions = 750 KB vs ~4 MB if round-tripped through JSON). + */ +export function captionEmbeddingsPath(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), 'captions-embeddings.bin']; +} + +/** + * Segments for `media/{id}/cache/ai/captions-image-embeddings.bin`. Same + * packing as the text embeddings bin but in the CLIP joint embedding + * space (typically 512-dim), so semantic queries can fall back to + * matching on what the clip *looks like* when caption text is thin. + */ +export function captionImageEmbeddingsPath(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), 'captions-image-embeddings.bin']; +} + +/** + * Workspace-root-relative path (forward-slash separated) for a caption thumb, + * safe to persist in JSON / `MediaCaption.thumbRelPath`. + */ +export function captionThumbRelPath(mediaId: string, index: number): string { + return captionThumbPath(mediaId, index).join('/'); +} + +/** + * Legacy path kept only for read-fallback. New writes go through + * `aiOutputPath(mediaId, 'transcript')`. + */ +export function legacyTranscriptPath(mediaId: string): string[] { + return [...mediaCacheDir(mediaId), CACHE_TRANSCRIPT_FILENAME_LEGACY]; } export function cacheMetaPath(dir: string[]): string[] { @@ -239,10 +299,11 @@ export function contentDataPath(hash: string, extension: string): string[] { return [...contentDir(hash), `data.${ext}`]; } -/* ───────────────── Mirrored OPFS caches (shared across origins) ─────────────── */ +/* ---------------- Shared persisted caches ---------------- */ // -// These caches are primary in OPFS for speed but are also mirrored into the -// workspace folder so other origins can read them without regenerating. +// These caches live outside the media metadata tree so other origins can reuse +// them without regenerating. Some still hydrate from legacy OPFS on demand. +// Filmstrips intentionally use top-level `filmstrips/{mediaId}/...`. export const WORKSPACE_PROXIES_DIR = 'proxies'; export const WORKSPACE_FILMSTRIPS_DIR = 'filmstrips'; @@ -266,13 +327,16 @@ export function filmstripMetaPath(mediaId: string): string[] { } export function previewAudioPath(relativePath: string): string[] { - // relativePath like 'm-123/track-left.wav' — keep original OPFS layout. - return [WORKSPACE_PREVIEW_AUDIO_DIR, ...relativePath.split('/')]; + // Legacy metadata may already include the top-level 'preview-audio/' prefix. + // Normalize that away so workspace paths stay rooted at one shared folder. 
+ const normalized = relativePath.replace(/^preview-audio\//, '');
+ return [WORKSPACE_PREVIEW_AUDIO_DIR, ...normalized.split('/').filter(Boolean)];
}

/**
- * Fast multi-resolution waveform binary — the OPFS-primary cache used by the
- * timeline renderer, mirrored here for cross-origin reuse. Different from
+ * Fast multi-resolution waveform binary. The timeline renderer still keeps an
+ * OPFS-local copy for range reads, while the workspace mirror enables
+ * cross-origin reuse. Different from
 * `waveformBinPath` above, which addresses bins inside the per-media cache.
 */
export function waveformBinaryPath(mediaId: string): string[] {
diff --git a/src/infrastructure/storage/workspace-fs/scenes.ts b/src/infrastructure/storage/workspace-fs/scenes.ts
new file mode 100644
index 000000000..3548fb3af
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/scenes.ts
@@ -0,0 +1,109 @@
+/**
+ * Per-media scene-detection results.
+ *
+ * Stored at `media/{mediaId}/cache/ai/scenes.json` as an {@link AiOutput}
+ * envelope. Scene cuts are a property of the source media (not the timeline
+ * clip), so caching by `mediaId` survives trim/split edits.
+ *
+ * Detection parameters (method, sample interval, verification model) are
+ * persisted in the envelope so consumers can skip the expensive recompute
+ * when the requested parameters match, and re-run when they don't.
+ */
+
+import type { SceneCut } from '@/infrastructure/analysis';
+import { createLogger } from '@/shared/logging/logger';
+
+import { readAiOutput, writeAiOutput, deleteAiOutput } from './ai-outputs';
+import type { ScenesPayload, SceneCutPayload } from './ai-outputs';
+
+const logger = createLogger('WorkspaceFS:Scenes');
+
+export interface SavedScenes {
+ method: 'histogram' | 'optical-flow';
+ sampleIntervalMs: number;
+ verificationModel?: string;
+ fps: number;
+ cuts: SceneCut[];
+}
+
+interface SaveScenesInput extends SavedScenes {
+ mediaId: string;
+ /** Stable provider id (e.g. `"scene-detect-histogram"`, `"scene-detect-optical-flow"`). */
+ service: string;
+ /** Detector/model identifier — for histogram this is just `"histogram"`. */
+ model: string;
+}
+
+function cutsToPayload(cuts: SceneCut[]): SceneCutPayload[] {
+ return cuts.map((cut) => ({
+ frame: cut.frame,
+ time: cut.time,
+ motion: cut.motion,
+ verified: cut.verified,
+ }));
+}
+
+function payloadToCuts(cuts: SceneCutPayload[]): SceneCut[] {
+ return cuts as unknown as SceneCut[];
+}
+
+export async function getScenes(mediaId: string): Promise<SavedScenes | undefined> {
+ try {
+ const envelope = await readAiOutput(mediaId, 'scenes');
+ if (!envelope) return undefined;
+ const data: ScenesPayload = envelope.data;
+ return {
+ method: data.method,
+ sampleIntervalMs: data.sampleIntervalMs,
+ verificationModel: data.verificationModel,
+ fps: data.fps,
+ cuts: payloadToCuts(data.cuts),
+ };
+ } catch (error) {
+ logger.error(`getScenes(${mediaId}) failed`, error);
+ throw new Error(`Failed to load scenes: ${mediaId}`);
+ }
+}
+
+export async function saveScenes(input: SaveScenesInput): Promise<SavedScenes> {
+ try {
+ const payload: ScenesPayload = {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel,
+ fps: input.fps,
+ cuts: cutsToPayload(input.cuts),
+ };
+ await writeAiOutput({
+ mediaId: input.mediaId,
+ kind: 'scenes',
+ service: input.service,
+ model: input.model,
+ params: {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel ??
null,
+ },
+ data: payload,
+ });
+ return {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel,
+ fps: input.fps,
+ cuts: input.cuts,
+ };
+ } catch (error) {
+ logger.error(`saveScenes(${input.mediaId}) failed`, error);
+ throw new Error(`Failed to save scenes: ${input.mediaId}`);
+ }
+}
+
+export async function deleteScenes(mediaId: string): Promise<void> {
+ try {
+ await deleteAiOutput(mediaId, 'scenes');
+ } catch (error) {
+ logger.error(`deleteScenes(${mediaId}) failed`, error);
+ throw new Error(`Failed to delete scenes: ${mediaId}`);
+ }
+}
diff --git a/src/infrastructure/storage/workspace-fs/transcripts.test.ts b/src/infrastructure/storage/workspace-fs/transcripts.test.ts
index bf9a358f5..9c8ecd501 100644
--- a/src/infrastructure/storage/workspace-fs/transcripts.test.ts
+++ b/src/infrastructure/storage/workspace-fs/transcripts.test.ts
@@ -23,6 +23,8 @@ import {
} from './transcripts';
import { setWorkspaceRoot } from './root';
import { asHandle, createRoot } from './__tests__/in-memory-handle';
+import { writeJsonAtomic } from './fs-primitives';
+import { legacyTranscriptPath, aiOutputPath } from './paths';

function makeTranscript(mediaId: string): MediaTranscript {
return {
@@ -75,4 +77,33 @@
await deleteTranscript('m1');
expect(await getTranscript('m1')).toBeUndefined();
});
+
+ it('reads a legacy cache/transcript.json written before the ai/ migration', async () => {
+ const root = createRoot();
+ setWorkspaceRoot(asHandle(root));
+ await writeJsonAtomic(asHandle(root), legacyTranscriptPath('legacy-id'), makeTranscript('legacy-id'));
+
+ const loaded = await getTranscript('legacy-id');
+ expect(loaded?.mediaId).toBe('legacy-id');
+ expect(loaded?.segments[0]?.text).toBe('hello');
+ });
+
+ it('saveTranscript migrates legacy path to ai/ envelope', async () => {
+ const root = createRoot();
+ setWorkspaceRoot(asHandle(root));
+ await writeJsonAtomic(asHandle(root), legacyTranscriptPath('m2'), makeTranscript('m2'));
+
+ // Round-trip through save rewrites to the new path.
+ await saveTranscript(makeTranscript('m2'));
+
+ // Allow the fire-and-forget legacy cleanup to settle.
+ await Promise.resolve();
+
+ const { readJson } = await import('./fs-primitives');
+ const legacy = await readJson(asHandle(root), legacyTranscriptPath('m2'));
+ expect(legacy).toBeNull();
+
+ const envelope = await readJson(asHandle(root), aiOutputPath('m2', 'transcript'));
+ expect(envelope).toBeTruthy();
+ });
});
diff --git a/src/infrastructure/storage/workspace-fs/transcripts.ts b/src/infrastructure/storage/workspace-fs/transcripts.ts
index 5234b8dd1..aad1685c3 100644
--- a/src/infrastructure/storage/workspace-fs/transcripts.ts
+++ b/src/infrastructure/storage/workspace-fs/transcripts.ts
@@ -1,30 +1,47 @@
/**
* Per-media transcripts backed by the workspace folder.
*
- * Stored at `media/{mediaId}/cache/transcript.json`. Pure JSON record —
- * no binary data or handles involved.
+ * Persisted at `media/{mediaId}/cache/ai/transcript.json` as an
+ * {@link AiOutput} envelope. Reads fall back to the legacy
+ * `cache/transcript.json` path; the next save rewrites to the new location,
+ * so the migration is invisible to callers.
+ *
+ * The public API still exposes {@link MediaTranscript} — the flat record
+ * shape predates the envelope and is what the UI and indexers consume.
*/

import type { MediaTranscript } from '@/types/storage';
import { createLogger } from '@/shared/logging/logger';

import { requireWorkspaceRoot } from './root';
+import { readJson, removeEntry } from './fs-primitives';
+import { legacyTranscriptPath } from './paths';
import {
- readJson,
- removeEntry,
- writeJsonAtomic,
-} from './fs-primitives';
-import { transcriptPath } from './paths';
+ readAiOutput,
+ writeAiOutput,
+ deleteAiOutput,
+ getMediaIdsWithAiOutput,
+ transcriptFromLegacy,
+ transcriptToLegacy,
+} from './ai-outputs';

const logger = createLogger('WorkspaceFS:Transcripts');

+async function readLegacyTranscript(mediaId: string): Promise<MediaTranscript | undefined> {
+ const root = requireWorkspaceRoot();
+ const legacy = await readJson<MediaTranscript>(root, legacyTranscriptPath(mediaId));
+ return legacy ?? undefined;
+}
+
export async function getTranscript(
mediaId: string,
): Promise<MediaTranscript | undefined> {
- const root = requireWorkspaceRoot();
try {
- const transcript = await readJson<MediaTranscript>(root, transcriptPath(mediaId));
- return transcript ?? undefined;
+ const envelope = await readAiOutput(mediaId, 'transcript');
+ if (envelope) return transcriptToLegacy(envelope);
+
+ const legacy = await readLegacyTranscript(mediaId);
+ return legacy ?? undefined;
} catch (error) {
logger.error(`getTranscript(${mediaId}) failed`, error);
throw new Error(`Failed to load transcript: ${mediaId}`);
@@ -35,18 +52,17 @@ export async function getTranscriptMediaIds(
mediaIds: string[],
): Promise<Set<string>> {
if (mediaIds.length === 0) return new Set();
- const root = requireWorkspaceRoot();
try {
- const ready = new Set<string>();
- const results = await Promise.all(
- mediaIds.map(async (id) => {
- const t = await readJson<MediaTranscript>(root, transcriptPath(id));
- return t ?? null;
- }),
- );
- results.forEach((r) => {
- if (r?.mediaId) ready.add(r.mediaId);
- });
+ const ready = await getMediaIdsWithAiOutput(mediaIds, 'transcript');
+ const missing = mediaIds.filter((id) => !ready.has(id));
+ if (missing.length > 0) {
+ const legacyResults = await Promise.all(
+ missing.map(async (id) => ((await readLegacyTranscript(id)) ? id : null)),
+ );
+ for (const id of legacyResults) {
+ if (id) ready.add(id);
+ }
+ }
return ready;
} catch (error) {
logger.error('getTranscriptMediaIds failed', error);
@@ -57,10 +73,24 @@ export async function saveTranscript(
transcript: MediaTranscript,
): Promise<MediaTranscript> {
- const root = requireWorkspaceRoot();
try {
- await writeJsonAtomic(root, transcriptPath(transcript.mediaId), transcript);
- return transcript;
+ const envelope = transcriptFromLegacy(transcript);
+ const written = await writeAiOutput({
+ mediaId: envelope.mediaId,
+ kind: 'transcript',
+ service: envelope.service,
+ model: envelope.model,
+ params: envelope.params,
+ data: envelope.data,
+ });
+
+ // Fire-and-forget legacy-path cleanup on successful migration.
+ const root = requireWorkspaceRoot();
+ void removeEntry(root, legacyTranscriptPath(transcript.mediaId)).catch(
+ (error) => logger.warn(`legacy transcript cleanup failed for ${transcript.mediaId}`, error),
+ );
+
+ return transcriptToLegacy(written);
} catch (error) {
logger.error(`saveTranscript(${transcript.mediaId}) failed`, error);
throw new Error(`Failed to save transcript: ${transcript.mediaId}`);
@@ -68,9 +98,10 @@ }

export async function deleteTranscript(mediaId: string): Promise<void> {
- const root = requireWorkspaceRoot();
try {
- await removeEntry(root, transcriptPath(mediaId));
+ await deleteAiOutput(mediaId, 'transcript');
+ const root = requireWorkspaceRoot();
+ await removeEntry(root, legacyTranscriptPath(mediaId));
} catch (error) {
logger.error(`deleteTranscript(${mediaId}) failed`, error);
throw new Error(`Failed to delete transcript: ${mediaId}`);
diff --git a/src/lib/analysis/captioning/lfm-captioning-provider.ts b/src/lib/analysis/captioning/lfm-captioning-provider.ts
index a3800825b..3b5bf2dc8 100644
--- a/src/lib/analysis/captioning/lfm-captioning-provider.ts
+++ b/src/lib/analysis/captioning/lfm-captioning-provider.ts
@@ -99,8 +99,13 @@ function waitForReady(
});
}

-function captionSingle(worker: Worker, id: number, imageBlob: Blob, signal?: AbortSignal): Promise<string> {
- return new Promise<string>((resolve, reject) => {
+function captionSingle(
+ worker: Worker,
+ id: number,
+ imageBlob: Blob,
+ signal?: AbortSignal,
+): Promise<ReturnType<typeof parseSceneCaptionResponse>> {
+ return new Promise<ReturnType<typeof parseSceneCaptionResponse>>((resolve, reject) => {
const onAbort = () => {
cleanup();
reject(signal!.reason);
};
@@ -115,7 +120,10 @@
const onMessage = (event: MessageEvent) => {
if (event.data.type === 'caption' && event.data.id === id) {
cleanup();
- resolve(event.data.caption ?? '');
+ resolve({
+ text: event.data.caption ?? '',
+ sceneData: event.data.sceneData,
+ });
}
};
@@ -145,6 +153,7 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = {
onProgress,
signal,
sampleIntervalSec: rawSampleInterval = DEFAULT_SAMPLE_INTERVAL_SEC,
+ saveThumbnail,
} = options;
const sampleIntervalSec = Number.isFinite(rawSampleInterval) && rawSampleInterval > 0
? rawSampleInterval
@@ -191,15 +200,25 @@
totalFrames: timestamps.length,
});

- const text = await captionSingle(worker, index, blob, signal);
- if (text) {
+ const result = await captionSingle(worker, index, blob, signal);
+ if (result.text) {
+ let thumbRelPath: string | undefined;
+ if (saveThumbnail) {
+ try {
+ thumbRelPath = await saveThumbnail(index, blob);
+ } catch (error) {
+ log.warn('Caption thumbnail persist failed — skipping', { index, error });
+ }
+ }
captions.push({
timeSec: Math.round(timeSec * 10) / 10,
- text,
+ text: result.text,
+ ...(result.sceneData ? { sceneData: result.sceneData } : {}),
+ ...(thumbRelPath ?
{ thumbRelPath } : {}), }); } - log.info('Frame caption', { frame: index, time: timeSec.toFixed(1), length: text.length }); + log.info('Frame caption', { frame: index, time: timeSec.toFixed(1), length: result.text.length }); } return captions; @@ -225,7 +244,7 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = { totalFrames: 1, }); - const text = await captionSingle(worker, 0, imageBlob, signal); + const result = await captionSingle(worker, 0, imageBlob, signal); onProgress?.({ stage: 'captioning', @@ -234,8 +253,14 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = { totalFrames: 1, }); - log.info('Image caption', { length: text.length }); - return text ? [{ timeSec: 0, text }] : []; + log.info('Image caption', { length: result.text.length }); + return result.text + ? [{ + timeSec: 0, + text: result.text, + ...(result.sceneData ? { sceneData: result.sceneData } : {}), + }] + : []; } finally { worker.postMessage({ type: 'dispose' }); setTimeout(() => worker.terminate(), 500); diff --git a/src/lib/analysis/captioning/scene-caption-format.test.ts b/src/lib/analysis/captioning/scene-caption-format.test.ts new file mode 100644 index 000000000..72935f897 --- /dev/null +++ b/src/lib/analysis/captioning/scene-caption-format.test.ts @@ -0,0 +1,137 @@ +import { describe, expect, it } from 'vitest'; +import { + formatSceneCaption, + formatSceneCaptionFromData, + LFM_SCENE_CAPTION_PROMPT, + normalizeShotVocabulary, + normalizeSceneCaptionData, + parseSceneCaptionResponse, +} from './scene-caption-format'; + +describe('LFM_SCENE_CAPTION_PROMPT', () => { + it('asks for JSON only with structured scene fields', () => { + expect(LFM_SCENE_CAPTION_PROMPT).toContain('return a valid JSON object only'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('"caption": string'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('"shotType": string | null'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Use null for missing scalar fields and [] for missing subjects'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('The first character of the response must be { and the last character must be }'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Use double quotes around every key and every string value'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Do not mention camera motion'); + }); +}); + +describe('normalizeShotVocabulary', () => { + it('normalizes common shot-term spelling and hyphenation inside prose', () => { + expect(normalizeShotVocabulary('A medium close up of a singer')).toBe('A medium close-up of a singer'); + expect(normalizeShotVocabulary('An extreme closeup of an eye')).toBe('An extreme close-up of an eye'); + expect(normalizeShotVocabulary('A medium wide shot of a street')).toBe('A medium-wide shot of a street'); + }); +}); + +describe('normalizeSceneCaptionData', () => { + it('canonicalizes shotType aliases and strips empty fields', () => { + expect(normalizeSceneCaptionData({ + caption: 'A singer under stage lights.', + shot_type: 'medium close up', + subjects: ['singer', ' ', 'microphone'], + weather: 'unknown', + })).toEqual({ + caption: 'A singer under stage lights.', + shotType: 'medium close-up', + subjects: ['singer', 'microphone'], + }); + }); +}); + +describe('formatSceneCaption', () => { + it('strips lead-ins and standardizes leading shot phrasing', () => { + expect(formatSceneCaption('This image shows a medium wide shot of a woman in a cafe')).toBe( + 'Medium-wide shot of a woman in a cafe.', + ); + }); + + it('collapses multi-sentence output to one sentence', () => { + 
expect(formatSceneCaption('Wide shot of two people crossing a city street. Rain falls in the distance.')).toBe( + 'Wide shot of two people crossing a city street.', + ); + }); + + it('drops uncertain time-of-day or weather clauses instead of persisting guesses', () => { + expect(formatSceneCaption('Close up of a woman indoors, possibly at dusk')).toBe( + 'Close-up of a woman indoors.', + ); + expect(formatSceneCaption('A wide shot of a street, maybe rainy')).toBe( + 'Wide shot of a street.', + ); + }); +}); + +describe('formatSceneCaptionFromData', () => { + it('builds a readable fallback sentence from structured fields', () => { + expect(formatSceneCaptionFromData({ + shotType: 'wide shot', + subjects: ['two people'], + action: 'walking across the street', + setting: 'city street', + timeOfDay: 'dusk', + weather: 'rainy', + })).toBe('Wide shot of two people walking across the street in city street in rainy weather at dusk.'); + }); +}); + +describe('parseSceneCaptionResponse', () => { + it('parses JSON responses and preserves structured scene data', () => { + expect(parseSceneCaptionResponse( + '{"caption":"A woman in a red coat walks through a rainy city street at dusk.","shotType":"wide shot","subjects":["woman"],"action":"walking through the street","setting":"city street","lighting":"dim evening light","timeOfDay":"dusk","weather":"rainy"}', + )).toEqual({ + text: 'A woman in a red coat walks through a rainy city street at dusk.', + sceneData: { + caption: 'A woman in a red coat walks through a rainy city street at dusk.', + shotType: 'wide shot', + subjects: ['woman'], + action: 'walking through the street', + setting: 'city street', + lighting: 'dim evening light', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + }); + + it('accepts fenced JSON and falls back to the structured fields when caption is missing', () => { + expect(parseSceneCaptionResponse( + '```json\n{"shotType":"medium close up","subjects":["singer"],"action":"singing into a microphone","setting":"stage","timeOfDay":null,"weather":null}\n```', + )).toEqual({ + text: 'Medium close-up of singer singing into a microphone in stage.', + sceneData: { + caption: 'Medium close-up of singer singing into a microphone in stage.', + shotType: 'medium close-up', + subjects: ['singer'], + action: 'singing into a microphone', + setting: 'stage', + }, + }); + }); + + it('falls back to freeform text formatting when JSON parsing fails', () => { + expect(parseSceneCaptionResponse('This image shows a close up of a hand holding a glass')).toEqual({ + text: 'Close-up of a hand holding a glass.', + }); + }); + + it('recovers known fields from json-ish output when strict parsing fails', () => { + expect(parseSceneCaptionResponse( + 'Json ["caption":"A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.","shotType":"medium wide shot","subjects":["lanterns","corridor","text"],"action":"glowing softly","setting":"interior corridor","lighting":"golden lantern light","timeOfDay":null,"weather":null}.', + )).toEqual({ + text: 'A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.', + sceneData: { + caption: 'A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.', + shotType: 'medium-wide shot', + subjects: ['lanterns', 'corridor', 'text'], + action: 'glowing softly', + setting: 'interior corridor', + lighting: 'golden lantern light', + }, + }); + }); +}); diff --git 
a/src/lib/analysis/captioning/scene-caption-format.ts b/src/lib/analysis/captioning/scene-caption-format.ts new file mode 100644 index 000000000..5b50fed5f --- /dev/null +++ b/src/lib/analysis/captioning/scene-caption-format.ts @@ -0,0 +1,428 @@ +import type { SceneCaptionData } from './types'; + +export const CANONICAL_SHOT_SIZES = [ + 'extreme wide shot', + 'wide shot', + 'medium-wide shot', + 'medium shot', + 'medium close-up', + 'close-up', + 'extreme close-up', +] as const; + +export const LFM_SCENE_CAPTION_PROMPT = + 'Analyze this single video frame and return a valid JSON object only.\n\n' + + 'Use this exact schema:\n' + + '{' + + '"caption": string, ' + + '"shotType": string | null, ' + + '"subjects": string[], ' + + '"action": string | null, ' + + '"setting": string | null, ' + + '"lighting": string | null, ' + + '"timeOfDay": string | null, ' + + '"weather": string | null' + + '}\n\n' + + 'Rules:\n' + + '- "caption" must be one detailed natural sentence.\n' + + '- Describe the visible subject, action, setting, lighting, time of day, and weather when clearly visible.\n' + + `- "shotType" is optional and must be one of: ${CANONICAL_SHOT_SIZES.join(', ')}.\n` + + '- If shot size is not unmistakable, use null.\n' + + '- If time of day or weather is unclear, use null.\n' + + '- Use null for missing scalar fields and [] for missing subjects.\n' + + '- The first character of the response must be { and the last character must be }.\n' + + '- Use double quotes around every key and every string value.\n' + + '- Do not mention camera motion, camera movement, editing, or uncertainty.\n' + + '- Do not wrap the JSON in markdown fences or prose.'; + +const LABEL_PREFIX_PATTERN = /^(?:caption|scene|description)\s*:\s*/i; +const JSON_LEAD_IN_PATTERN = /^(?:json(?:\s+(?:object|response))?|response|output)\s*[:-]?\s*/i; +const SHOT_LABEL_PREFIX_PATTERN = /^shot(?:\s+type)?\s*:\s*/i; +const LEAD_IN_PATTERNS = [ + /^(?:this|the)\s+(?:image|frame|scene|shot)\s+(?:shows|depicts|features)\s+/i, + /^(?:we can see|we see)\s+/i, +] as const; +const SHOT_ONLY_PATTERN = + /^(?:shot(?:\s+type)?\s*:\s*)?(?:extreme wide shot|wide shot|medium-wide shot|medium shot|medium close-up|close-up|extreme close-up)$/i; +const UNCERTAIN_ENVIRONMENT_TAIL_PATTERN = + /(?:,\s*|\s+-\s+|\s+)(?:possibly|maybe|perhaps|likely|apparently|seemingly|it\s+seems\s+to\s+be|it\s+appears\s+to\s+be|appears\s+to\s+be|seems\s+to\s+be)\s+(?:at\s+)?(?:sunrise|dawn|morning|day(?:time)?|afternoon|golden\s+hour|sunset|dusk|night(?:time)?|rain(?:y|ing)?|snow(?:y|ing)?|fog(?:gy)?|mist(?:y)?|overcast|cloudy|sunny|storm(?:y)?|clear(?:\s+sk(?:y|ies))?)\b[^.?!,;:]*$/i; +const EMPTY_FIELD_PATTERN = /^(?:null|none|n\/a|unknown|unclear|not visible|not obvious)$/i; +const QUOTE_WRAPPER_PATTERN = /^"(.*)"$/s; + +function normalizeWhitespace(text: string): string { + return text.replace(/\s+/g, ' ').trim(); +} + +function stripOuterQuotes(text: string): string { + return text.replace(/^[`"']+|[`"']+$/g, ''); +} + +function stripLeadIns(text: string): string { + let next = text.trim(); + next = next.replace(/^[\s\-*]+/, ''); + next = next.replace(JSON_LEAD_IN_PATTERN, ''); + next = next.replace(LABEL_PREFIX_PATTERN, ''); + + for (const pattern of LEAD_IN_PATTERNS) { + next = next.replace(pattern, ''); + } + + return next.trim(); +} + +function stripTerminalPunctuation(text: string): string { + return text.replace(/[.!?]+$/u, '').trim(); +} + +function lowerCaseFirst(text: string): string { + if (text.length === 0) return text; + return 
text.charAt(0).toLowerCase() + text.slice(1); +} + +function upperCaseFirst(text: string): string { + if (text.length === 0) return text; + return text.charAt(0).toUpperCase() + text.slice(1); +} + +export function normalizeShotVocabulary(text: string): string { + let next = text; + + const replacements: Array<[RegExp, string]> = [ + [/\bextreme[\s-]+long shot\b/gi, 'extreme wide shot'], + [/\bextreme[\s-]+wide shot\b/gi, 'extreme wide shot'], + [/\bmedium[\s-]+wide shot\b/gi, 'medium-wide shot'], + [/\bmedium[\s-]+close[\s-]*up\b/gi, 'medium close-up'], + [/\bmedium[\s-]+close shot\b/gi, 'medium close-up'], + [/\bextreme[\s-]+close[\s-]*up\b/gi, 'extreme close-up'], + [/\bclose[\s-]*up\b/gi, 'close-up'], + [/\blong shot\b/gi, 'wide shot'], + [/\bwide shot\b/gi, 'wide shot'], + [/\bmedium shot\b/gi, 'medium shot'], + ]; + + for (const [pattern, replacement] of replacements) { + next = next.replace(pattern, replacement); + } + + return next; +} + +function collapseToSingleSentence(text: string): string { + const fragments = text + .split(/(?:\r?\n)+|(?<=[.!?])\s+|;\s+/u) + .map((fragment) => normalizeWhitespace(stripOuterQuotes(fragment))) + .filter(Boolean); + + if (fragments.length === 0) return ''; + if (fragments.length === 1) return fragments[0]!; + + const first = normalizeShotVocabulary(stripTerminalPunctuation(fragments[0]!)); + if (SHOT_ONLY_PATTERN.test(first)) { + const shot = stripTerminalPunctuation(first.replace(SHOT_LABEL_PREFIX_PATTERN, '')).toLowerCase(); + const followUp = stripTerminalPunctuation(stripLeadIns(fragments[1]!)); + if (followUp.length > 0) { + return `${shot} in which ${lowerCaseFirst(followUp)}`; + } + } + + return fragments[0]!; +} + +function stripUncertainEnvironmentTail(text: string): string { + return text.replace(UNCERTAIN_ENVIRONMENT_TAIL_PATTERN, ''); +} + +function stripLeadingShotArticle(text: string): string { + return text.replace( + /^(?:a|an)\s+(extreme wide shot|wide shot|medium-wide shot|medium shot|medium close-up|close-up|extreme close-up)\b/i, + (_, shot: string) => shot.toLowerCase(), + ); +} + +function sanitizeScalar(value: unknown): string | undefined { + if (typeof value !== 'string') return undefined; + const normalized = normalizeWhitespace(stripOuterQuotes(value)); + if (normalized.length === 0 || EMPTY_FIELD_PATTERN.test(normalized)) return undefined; + return normalized; +} + +function sanitizeSubjects(value: unknown): string[] | undefined { + if (!Array.isArray(value)) return undefined; + const subjects = value + .map((entry) => sanitizeScalar(entry)) + .filter((entry): entry is string => Boolean(entry)); + return subjects.length > 0 ? 
subjects : undefined; +} + +function normalizeShotType(value: unknown): string | undefined { + const scalar = sanitizeScalar(value); + if (!scalar) return undefined; + const compact = stripTerminalPunctuation(scalar).toLowerCase(); + const aliasMap: Record<string, string> = { + 'extreme wide': 'extreme wide shot', + 'extreme wide shot': 'extreme wide shot', + 'extreme long shot': 'extreme wide shot', + 'wide': 'wide shot', + 'wide shot': 'wide shot', + 'long shot': 'wide shot', + 'medium wide': 'medium-wide shot', + 'medium wide shot': 'medium-wide shot', + 'medium-wide shot': 'medium-wide shot', + 'medium': 'medium shot', + 'medium shot': 'medium shot', + 'medium close': 'medium close-up', + 'medium close up': 'medium close-up', + 'medium close-up': 'medium close-up', + 'close': 'close-up', + 'close up': 'close-up', + 'close-up': 'close-up', + 'extreme close': 'extreme close-up', + 'extreme close up': 'extreme close-up', + 'extreme close-up': 'extreme close-up', + }; + const normalized = aliasMap[compact] ?? normalizeShotVocabulary(compact).toLowerCase(); + return CANONICAL_SHOT_SIZES.find((shot) => shot === normalized); +} + +function hasStructuredFields(data: SceneCaptionData): boolean { + return Boolean( + data.caption + || data.shotType + || (data.subjects && data.subjects.length > 0) + || data.action + || data.setting + || data.lighting + || data.timeOfDay + || data.weather, + ); +} + +function escapeRegExp(text: string): string { + return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function decodeLooseValue(raw: string): string | null | undefined { + const trimmed = raw.trim(); + if (trimmed.length === 0) return undefined; + if (/^null$/i.test(trimmed)) return null; + + if (/^"(?:\\.|[^"])*"$/s.test(trimmed)) { + try { + const parsed = JSON.parse(trimmed) as unknown; + return typeof parsed === 'string' ? parsed : undefined; + } catch { + return stripOuterQuotes(trimmed); + } + } + + if (/^'(?:\\.|[^'])*'$/s.test(trimmed)) { + return stripOuterQuotes(trimmed) + .replace(/\\'/g, '\'') + .replace(/\\"/g, '"') + .replace(/\\\\/g, '\\'); + } + + return trimmed.replace(/[.,;:]+$/u, '').trim(); +} + +function extractLooseScalar(raw: string, keys: string[]): string | null | undefined { + const pattern = new RegExp( + String.raw`(?:["']?(?:${keys.map(escapeRegExp).join('|')})["']?)\s*:\s*(null|"(?:\\.|[^"])*"|'(?:\\.|[^'])*'|[^,\r\n}]+)`, + 'i', + ); + const match = raw.match(pattern); + return match?.[1] ?
decodeLooseValue(match[1]) : undefined; +} + +function extractLooseSubjects(raw: string): string[] | undefined { + const match = raw.match(/(?:["']?subjects["']?)\s*:\s*\[([\s\S]*?)\]/i); + if (!match) return undefined; + + const entries = Array.from(match[1].matchAll(/"(?:\\.|[^"])*"|'(?:\\.|[^'])*'|[^,\]]+/g)) + .map((entry) => decodeLooseValue(entry[0])) + .filter((entry): entry is string => typeof entry === 'string' && entry.length > 0); + + return entries; +} + +function parseLooseJsonObject(raw: string): Record<string, unknown> | null { + const object: Record<string, unknown> = {}; + const normalized = stripLeadIns(raw); + + const caption = extractLooseScalar(normalized, ['caption']); + if (caption !== undefined) object.caption = caption; + + const shotType = extractLooseScalar(normalized, ['shotType', 'shot_type']); + if (shotType !== undefined) object.shotType = shotType; + + const subjects = extractLooseSubjects(normalized); + if (subjects !== undefined) object.subjects = subjects; + + const action = extractLooseScalar(normalized, ['action']); + if (action !== undefined) object.action = action; + + const setting = extractLooseScalar(normalized, ['setting']); + if (setting !== undefined) object.setting = setting; + + const lighting = extractLooseScalar(normalized, ['lighting']); + if (lighting !== undefined) object.lighting = lighting; + + const timeOfDay = extractLooseScalar(normalized, ['timeOfDay', 'time_of_day']); + if (timeOfDay !== undefined) object.timeOfDay = timeOfDay; + + const weather = extractLooseScalar(normalized, ['weather']); + if (weather !== undefined) object.weather = weather; + + return Object.keys(object).length > 0 ? object : null; +} + +function extractJsonCandidate(raw: string): string | null { + const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i); + if (fenced?.[1]) { + return fenced[1].trim(); + } + + const start = raw.indexOf('{'); + const end = raw.lastIndexOf('}'); + if (start >= 0 && end > start) { + return raw.slice(start, end + 1); + } + + return null; +} + +function parseJsonObject(raw: string): Record<string, unknown> | null { + const candidate = extractJsonCandidate(raw); + if (!candidate) return null; + + try { + const parsed = JSON.parse(candidate) as unknown; + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) + ? parsed as Record<string, unknown> + : null; + } catch { + return null; + } +} + +function readField(object: Record<string, unknown>, ...keys: string[]): unknown { + for (const key of keys) { + if (key in object) return object[key]; + } + return undefined; +} + +export function normalizeSceneCaptionData(object: Record<string, unknown>): SceneCaptionData { + const sceneData: SceneCaptionData = { + caption: sanitizeScalar(readField(object, 'caption')), + shotType: normalizeShotType(readField(object, 'shotType', 'shot_type')), + subjects: sanitizeSubjects(readField(object, 'subjects')), + action: sanitizeScalar(readField(object, 'action')), + setting: sanitizeScalar(readField(object, 'setting')), + lighting: sanitizeScalar(readField(object, 'lighting')), + timeOfDay: sanitizeScalar(readField(object, 'timeOfDay', 'time_of_day')), + weather: sanitizeScalar(readField(object, 'weather')), + }; + + return hasStructuredFields(sceneData) ? sceneData : {}; +} + +function maybeWrapWithOf(fragment: string): string { + return /^(?:of|in|on|at)\b/i.test(fragment) ?
fragment : `of ${fragment}`; +} + +export function formatSceneCaption(raw: string): string { + let next = normalizeWhitespace(stripOuterQuotes(raw)); + if (next.length === 0) return ''; + + const quoted = next.match(QUOTE_WRAPPER_PATTERN); + if (quoted?.[1]) { + next = quoted[1]; + } + + next = stripLeadIns(next); + next = collapseToSingleSentence(next); + next = stripUncertainEnvironmentTail(next); + next = stripLeadIns(next); + next = normalizeShotVocabulary(next); + next = stripLeadingShotArticle(next); + next = stripTerminalPunctuation(normalizeWhitespace(next)); + + if (next.length === 0) return ''; + + next = upperCaseFirst(next); + return /[.!?]$/u.test(next) ? next : `${next}.`; +} + +export function formatSceneCaptionFromData(data: SceneCaptionData): string { + const subjectText = data.subjects?.join(', '); + let body = ''; + + if (subjectText && data.action) { + body = `${subjectText} ${lowerCaseFirst(data.action)}`; + } else if (subjectText) { + body = subjectText; + } else if (data.action) { + body = data.action; + } else if (data.setting) { + body = `scene in ${data.setting}`; + } else if (data.lighting) { + body = `scene in ${data.lighting}`; + } + + if (data.setting && body && !body.toLowerCase().includes(data.setting.toLowerCase())) { + body = `${body} in ${data.setting}`; + } + + if (data.weather) { + body = body ? `${body} in ${data.weather} weather` : `${data.weather} weather`; + } + + if (data.timeOfDay) { + body = body ? `${body} at ${data.timeOfDay}` : data.timeOfDay; + } + + if (!body && data.caption) { + body = data.caption; + } + + if (!body) return ''; + + if (data.shotType) { + return formatSceneCaption(`${data.shotType} ${maybeWrapWithOf(body)}`); + } + + return formatSceneCaption(body); +} + +export function parseSceneCaptionResponse(raw: string): { + text: string; + sceneData?: SceneCaptionData; +} { + const parsed = parseJsonObject(raw) ?? parseLooseJsonObject(raw); + if (!parsed) { + const text = formatSceneCaption(raw); + return text ? { text } : { text: '' }; + } + + const sceneData = normalizeSceneCaptionData(parsed); + const text = sceneData.caption + ? formatSceneCaption(sceneData.caption) + : formatSceneCaptionFromData(sceneData) || formatSceneCaption(raw); + + if (!text) { + return { text: '' }; + } + + if (!hasStructuredFields(sceneData)) { + return { text }; + } + + return { + text, + sceneData: { + ...sceneData, + caption: text, + }, + }; +} diff --git a/src/lib/analysis/captioning/types.ts b/src/lib/analysis/captioning/types.ts index d6f53e12c..1fa39cf19 100644 --- a/src/lib/analysis/captioning/types.ts +++ b/src/lib/analysis/captioning/types.ts @@ -1,6 +1,43 @@ +export interface SceneCaptionData { + caption?: string; + shotType?: string; + subjects?: string[]; + action?: string; + setting?: string; + lighting?: string; + timeOfDay?: string; + weather?: string; +} + export interface MediaCaption { timeSec: number; text: string; + /** + * Structured scene metadata emitted by the caption model. Preserved for + * future semantic/indexing work while `text` remains the user-facing and + * search-facing sentence. + */ + sceneData?: SceneCaptionData; + /** + * Workspace-relative path to a captured JPEG thumbnail for this scene, + * e.g. `media/{mediaId}/cache/ai/captions-thumbs/{index}.jpg`. Absent on + * captions generated before the Scene Browser feature landed. + */ + thumbRelPath?: string; + /** + * Dense sentence embedding of the caption's embed-text (caption + + * transcript + colors). 384-dim for all-MiniLM-L6-v2. 
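Stored mean-pooled and L2-normalized by the embeddings worker, so cosine similarity against a query vector reduces to a dot product.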
When present, + * enables semantic text search. + */ + embedding?: number[]; + /** + * Structural dominant-color palette for the thumbnail, in CIELAB + * with pixel-coverage weights. Powers ∆E-based color-query ranking + * independent of CLIP — Lab distances are perceptually uniform so + * "red" queries actually hit red scenes rather than whatever CLIP + * happens to associate with the token. + */ + palette?: Array<{ l: number; a: number; b: number; weight: number }>; } export interface CaptioningProgress { @@ -14,6 +51,13 @@ export interface CaptioningOptions { onProgress?: (progress: CaptioningProgress) => void; signal?: AbortSignal; sampleIntervalSec?: number; + /** + * Optional persistence hook invoked once per captioned frame with the + * JPEG the provider already captured for VLM inference. Return a + * workspace-relative path to stash on `MediaCaption.thumbRelPath`; + * return `undefined` to skip the thumbnail for that frame. + */ + saveThumbnail?: (index: number, blob: Blob) => Promise<string | undefined>; } export interface MediaCaptioningProvider { diff --git a/src/lib/analysis/embeddings/clip-provider.ts b/src/lib/analysis/embeddings/clip-provider.ts new file mode 100644 index 000000000..1444ce497 --- /dev/null +++ b/src/lib/analysis/embeddings/clip-provider.ts @@ -0,0 +1,221 @@ +/** + * Singleton provider over the CLIP image/text worker. + * + * Exposes four operations — `ensureReady`, `embedImages`, `embedTextForImages`, + * and `embedQueryForImages` — that together let the Scene Browser index + * thumbnails and search them by free-form text queries, both running + * off-thread so the UI stays responsive while the model downloads. + */ + +import { createLogger } from '@/shared/logging/logger'; +import { createClipWorker } from './create-clip-worker'; +import type { EmbeddingsOptions } from './types'; + +const log = createLogger('ClipProvider'); + +export const CLIP_MODEL_ID = 'Xenova/clip-vit-base-patch32'; +export const CLIP_EMBEDDING_DIM = 512; + +const INIT_TIMEOUT_MS = 120_000; + +let worker: Worker | null = null; +let readyPromise: Promise<void> | null = null; +let nextId = 0; + +function getWorker(): Worker { + if (!worker) { + worker = createClipWorker(); + worker.addEventListener('error', (event) => { + log.error('CLIP worker errored', event.message); + }); + } + return worker; +} + +function ensureReady(options: EmbeddingsOptions = {}): Promise<void> { + if (readyPromise) return readyPromise; + const w = getWorker(); + + readyPromise = new Promise<void>((resolve, reject) => { + const timeout = setTimeout(() => { + cleanup(); + reject(new Error('CLIP worker init timed out')); + }, INIT_TIMEOUT_MS); + + const cleanup = () => { + clearTimeout(timeout); + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('CLIP init aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.type === 'ready') { + cleanup(); + resolve(); + return; + } + if (message.type === 'progress') { + options.onProgress?.({ stage: 'loading-model', percent: message.percent ?? 0 }); + return; + } + if (message.type === 'error' && message.id === undefined) { + cleanup(); + reject(new Error(message.message ??
'CLIP worker init failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + w.postMessage({ type: 'init' }); + }); + + readyPromise.catch(() => { + readyPromise = null; + }); + + return readyPromise; +} + +type EmbedRequest = + | { kind: 'images'; payload: Blob[] } + | { kind: 'text'; payload: string[] }; + +function runEmbed(request: EmbedRequest, options: EmbeddingsOptions = {}): Promise<Float32Array[]> { + if (request.payload.length === 0) return Promise.resolve([]); + + return ensureReady(options).then(() => new Promise<Float32Array[]>((resolve, reject) => { + const id = ++nextId; + const w = getWorker(); + + const cleanup = () => { + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('CLIP embed aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.id !== id) return; + if (message.type === 'vectors') { + cleanup(); + resolve(message.vectors as Float32Array[]); + return; + } + if (message.type === 'error') { + cleanup(); + reject(new Error(message.message ?? 'CLIP embed failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + if (request.kind === 'images') { + w.postMessage({ type: 'embed-images', id, blobs: request.payload }); + } else { + w.postMessage({ type: 'embed-text', id, texts: request.payload }); + } + })); +} + +/** + * Natural-sentence templates for CLIP query expansion. CLIP was trained + * on descriptive captions (`"a photo of a cat sitting on a windowsill"`), + * not bare keywords, so a one-word query like `"fighting"` embeds into a + * lonely corner of the joint space where random unrelated images can + * score higher than they should (a vertical tower against "fighting" + * scored 0.21 in one test run — classic short-query noise). + * + * Embedding the query through each template and averaging the resulting + * vectors re-anchors it inside the distribution of sentences CLIP was + * trained on, boosting real matches and suppressing noise. Standard + * retrieval-quality trick; ~5–15 points of nDCG in the literature. + */ +const CLIP_QUERY_TEMPLATES = [ + (q: string) => `a photo of ${q}`, + (q: string) => `a picture of ${q}`, + (q: string) => `a scene showing ${q}`, + (q: string) => q, +]; + +function averageAndNormalize(vectors: Float32Array[]): Float32Array { + const dim = vectors[0]!.length; + const out = new Float32Array(dim); + for (const v of vectors) { + for (let i = 0; i < dim; i += 1) out[i] += v[i]!; + } + let sum = 0; + for (let i = 0; i < dim; i += 1) sum += out[i]! * out[i]!; + const norm = Math.sqrt(sum) || 1; + for (let i = 0; i < dim; i += 1) out[i] /= norm; + return out; +} + +export const clipProvider = { + ensureReady, + + /** + * Embed a batch of image blobs with the CLIP vision encoder. Returned + * vectors live in the 512-dim joint space, so cosine similarity against + * text-encoder outputs is meaningful.
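+ * + * @example + * // Sketch — assumes `thumbBlob` is one of the JPEGs captured at analyze time: + * const [imageVec] = await clipProvider.embedImages([thumbBlob]); + * const queryVec = await clipProvider.embedQueryForImages('sunset over the ocean'); + * // Both sides come back L2-normalized, so cosine similarity is a plain dot product: + * let score = 0; + * if (imageVec && queryVec) { + * for (let i = 0; i < imageVec.length; i += 1) score += imageVec[i]! * queryVec[i]!; + * }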
+ */ + embedImages(blobs: Blob[], options?: EmbeddingsOptions): Promise<Float32Array[]> { + return runEmbed({ kind: 'images', payload: blobs }, options); + }, + + /** + * Embed text with the CLIP text encoder so results can be compared to + * stored image embeddings. This is the low-level path — call it when + * you're indexing canonical text (e.g. captions). For user search + * queries use {@link embedQueryForImages} instead; it averages a few + * natural-language templates to counter CLIP's well-known short-query + * noise. + */ + embedTextForImages(texts: string[], options?: EmbeddingsOptions): Promise<Float32Array[]> { + return runEmbed({ kind: 'text', payload: texts }, options); + }, + + /** + * Embed a single user query by ensembling across {@link CLIP_QUERY_TEMPLATES}. + * Returns one 512-dim vector — the L2-normalized mean of the + * per-template embeddings — suitable for cosine-similarity ranking + * against stored image embeddings. + */ + async embedQueryForImages(query: string, options?: EmbeddingsOptions): Promise<Float32Array | null> { + const trimmed = query.trim(); + if (!trimmed) return null; + const templates = CLIP_QUERY_TEMPLATES.map((t) => t(trimmed)); + const vectors = await runEmbed({ kind: 'text', payload: templates }, options); + if (vectors.length === 0) return null; + return averageAndNormalize(vectors); + }, + + dispose(): void { + if (!worker) return; + worker.postMessage({ type: 'dispose' }); + worker.terminate(); + worker = null; + readyPromise = null; + }, +}; diff --git a/src/lib/analysis/embeddings/clip-worker.ts b/src/lib/analysis/embeddings/clip-worker.ts new file mode 100644 index 000000000..603b3d4f5 --- /dev/null +++ b/src/lib/analysis/embeddings/clip-worker.ts @@ -0,0 +1,217 @@ +/** + * Web Worker for CLIP image + text embeddings. + * + * Loads both halves of `Xenova/clip-vit-base-patch32` (q8 quantized, + * ~90 MB total) so the same worker can embed: + * - scene thumbnails at caption time (image encoder), producing + * vectors that get stored in `captions-image-embeddings.bin`, and + * - search queries at query time (text encoder), producing a vector + * in the *same* 512-dim space so cosine similarity against image + * embeddings is meaningful. + * + * Kept separate from the all-MiniLM text worker because the models are + * large and users who never switch to semantic search shouldn't pay the + * CLIP download cost. + * + * Messages: + * → { type: 'init' } + * → { type: 'embed-images', id, blobs: Blob[] } + * → { type: 'embed-text', id, texts: string[] } + * → { type: 'dispose' } + * ← { type: 'ready', dim: number } + * ← { type: 'progress', percent: number } + * ← { type: 'vectors', id, vectors: Float32Array[] } + * ← { type: 'error', id?, message } + */ + +import { + AutoProcessor, + AutoTokenizer, + CLIPTextModelWithProjection, + CLIPVisionModelWithProjection, + RawImage, + env, + type PreTrainedTokenizer, + type Processor, + type PreTrainedModel, +} from '@huggingface/transformers'; + +const MODEL_ID = 'Xenova/clip-vit-base-patch32'; + +env.useBrowserCache = true; +env.allowLocalModels = false; + +/* eslint-disable @typescript-eslint/no-explicit-any -- transformers.js + tensor types vary by version; the worker stays schema-stable.
*/ +let tokenizer: PreTrainedTokenizer | null = null; +let processor: Processor | null = null; +let textModel: PreTrainedModel | null = null; +let visionModel: PreTrainedModel | null = null; +let loading = false; +let disposed = false; +let loadGeneration = 0; +let embeddingDim = 512; + +function post(msg: Record<string, unknown>): void { + self.postMessage(msg); +} + +async function loadModel(): Promise<void> { + if (tokenizer && processor && textModel && visionModel) { + post({ type: 'ready', dim: embeddingDim }); + return; + } + if (loading) return; + loading = true; + disposed = false; + const thisGen = ++loadGeneration; + + try { + let lastPct = 0; + const onProgress = (info: { status?: string; total?: number; loaded?: number }) => { + if (info.status === 'progress' && info.total && info.loaded) { + const pct = (info.loaded / info.total) * 100; + if (pct - lastPct > 2) { + lastPct = pct; + post({ type: 'progress', percent: Math.round(pct) }); + } + } + }; + + const [loadedTokenizer, loadedProcessor, loadedTextModel, loadedVisionModel] = await Promise.all([ + AutoTokenizer.from_pretrained(MODEL_ID), + AutoProcessor.from_pretrained(MODEL_ID), + CLIPTextModelWithProjection.from_pretrained(MODEL_ID, { + dtype: 'q8', + progress_callback: onProgress, + } as any), + CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, { + dtype: 'q8', + progress_callback: onProgress, + } as any), + ]); + + if (disposed || thisGen !== loadGeneration) return; + + tokenizer = loadedTokenizer; + processor = loadedProcessor; + textModel = loadedTextModel; + visionModel = loadedVisionModel; + + // Probe the projection dim with a tiny warmup; different CLIP + // variants project to 512, 768, or 1024 dims and we want to be sure + // before callers start packing bins. + try { + const tokens = tokenizer(['probe'], { padding: true, truncation: true }) as any; + const output = (await (textModel as any)(tokens)) as any; + const dims: number[] | undefined = output?.text_embeds?.dims; + if (Array.isArray(dims) && dims.length > 0) { + embeddingDim = Number(dims[dims.length - 1]); + } + } catch { + // Stick with the default dim if the probe fails — the real embed + // calls will surface a more specific error if the model is bad. + } + + post({ type: 'ready', dim: embeddingDim }); + } catch (error) { + post({ type: 'error', message: error instanceof Error ? error.message : String(error) }); + } finally { + loading = false; + } +} + +function normalize(vector: Float32Array): Float32Array { + let sum = 0; + for (let i = 0; i < vector.length; i += 1) sum += vector[i]! * vector[i]!; + const norm = Math.sqrt(sum) || 1; + const out = new Float32Array(vector.length); + for (let i = 0; i < vector.length; i += 1) out[i] = vector[i]!
/ norm; + return out; +} + +function splitPacked(packed: Float32Array, count: number, dim: number): Float32Array[] { + const vectors: Float32Array[] = []; + for (let i = 0; i < count; i += 1) { + vectors.push(normalize(packed.slice(i * dim, (i + 1) * dim))); + } + return vectors; +} + +async function embedImages(id: number, blobs: Blob[]): Promise<void> { + if (!processor || !visionModel) { + post({ type: 'error', id, message: 'CLIP worker not ready (vision)' }); + return; + } + if (blobs.length === 0) { + post({ type: 'vectors', id, vectors: [] }); + return; + } + try { + const images = await Promise.all(blobs.map((blob) => RawImage.fromBlob(blob))); + const inputs = await (processor as any)(images); + const output = (await (visionModel as any)(inputs)) as any; + const data = output?.image_embeds?.data as Float32Array | undefined; + if (!data) throw new Error('CLIP vision model returned no image_embeds'); + post({ type: 'vectors', id, vectors: splitPacked(data, blobs.length, embeddingDim) }); + } catch (error) { + post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) }); + } +} + +async function embedTexts(id: number, texts: string[]): Promise<void> { + if (!tokenizer || !textModel) { + post({ type: 'error', id, message: 'CLIP worker not ready (text)' }); + return; + } + if (texts.length === 0) { + post({ type: 'vectors', id, vectors: [] }); + return; + } + try { + const tokens = (tokenizer as any)(texts, { padding: true, truncation: true }); + const output = (await (textModel as any)(tokens)) as any; + const data = output?.text_embeds?.data as Float32Array | undefined; + if (!data) throw new Error('CLIP text model returned no text_embeds'); + post({ type: 'vectors', id, vectors: splitPacked(data, texts.length, embeddingDim) }); + } catch (error) { + post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) }); + } +} + +self.addEventListener('message', (event: MessageEvent) => { + const message = event.data; + if (!message || typeof message.type !== 'string') return; + + if (message.type === 'init') { + void loadModel(); + return; + } + + if (message.type === 'embed-images') { + const id = typeof message.id === 'number' ? message.id : 0; + const blobs = Array.isArray(message.blobs) ? (message.blobs as Blob[]) : []; + void embedImages(id, blobs); + return; + } + + if (message.type === 'embed-text') { + const id = typeof message.id === 'number' ? message.id : 0; + const texts = Array.isArray(message.texts) + ?
(message.texts as unknown[]).filter((t): t is string => typeof t === 'string') + : []; + void embedTexts(id, texts); + return; + } + + if (message.type === 'dispose') { + disposed = true; + tokenizer = null; + processor = null; + textModel = null; + visionModel = null; + loading = false; + return; + } +}); +/* eslint-enable @typescript-eslint/no-explicit-any */ diff --git a/src/lib/analysis/embeddings/context.test.ts b/src/lib/analysis/embeddings/context.test.ts new file mode 100644 index 000000000..fccbb957c --- /dev/null +++ b/src/lib/analysis/embeddings/context.test.ts @@ -0,0 +1,160 @@ +import { describe, expect, it } from 'vitest'; +import { + buildEmbeddingText, + sliceTranscript, +} from './context'; +import { parseSceneCaptionResponse } from '../captioning/scene-caption-format'; + +describe('sliceTranscript', () => { + const segments = [ + { text: 'In the beginning.', start: 0, end: 2 }, + { text: 'We see a mountain.', start: 2, end: 5 }, + { text: 'The chef prepares dinner.', start: 10, end: 13 }, + { text: 'Later that night.', start: 30, end: 32 }, + ]; + + it('pulls segments overlapping the caption window', () => { + expect(sliceTranscript(segments, 11, 2)).toBe('The chef prepares dinner.'); + }); + + it('joins adjacent overlapping segments with a space', () => { + expect(sliceTranscript(segments, 3, 2)).toBe('In the beginning. We see a mountain.'); + }); + + it('returns empty string when transcript is missing', () => { + expect(sliceTranscript(null, 10)).toBe(''); + expect(sliceTranscript(undefined, 10)).toBe(''); + expect(sliceTranscript([], 10)).toBe(''); + }); + + it('returns empty string when nothing overlaps', () => { + expect(sliceTranscript(segments, 20, 1)).toBe(''); + }); + + it('clips long transcripts to a word boundary', () => { + const long = Array.from({ length: 50 }, (_, i) => ({ + text: `this is sentence number ${i}`, + start: i, + end: i + 1, + })); + const result = sliceTranscript(long, 25, 20); + expect(result.length).toBeLessThanOrEqual(220); + expect(result.endsWith(' ')).toBe(false); + expect(result.split(' ').pop()).not.toMatch(/^[a-z]*\d{1,2}$/); + }); +}); + +describe('buildEmbeddingText', () => { + const base = { + caption: { text: 'A tree with orange leaves.', timeSec: 10 }, + }; + + it('always starts with SCENE: ', () => { + const result = buildEmbeddingText(base); + expect(result.startsWith('SCENE: A tree with orange leaves.')).toBe(true); + }); + + it('includes structured scene metadata when supplied', () => { + const result = buildEmbeddingText({ + ...base, + sceneData: { + shotType: 'medium close-up', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + expect(result).toMatch(/SHOT: medium close-up/); + expect(result).toMatch(/TIME: dusk/); + expect(result).toMatch(/WEATHER: rainy/); + }); + + it('omits SPEECH: when transcript is missing or unmatched', () => { + const result = buildEmbeddingText(base); + expect(result).not.toMatch(/SPEECH:/); + }); + + it('includes SPEECH: when transcript overlaps caption timestamp', () => { + const result = buildEmbeddingText({ + ...base, + transcriptSegments: [{ text: 'and here is hokkaido', start: 9, end: 11 }], + }); + expect(result).toMatch(/SPEECH: and here is hokkaido/); + }); + + it('does not emit SOURCE: because filename was dropped from context', () => { + const result = buildEmbeddingText(base); + expect(result).not.toMatch(/SOURCE:/); + }); + + it('includes COLORS: when a phrase is provided', () => { + const result = buildEmbeddingText({ ...base, colorPhrase: 'warm orange, teal' }); + 
expect(result).toMatch(/COLORS: warm orange, teal/); + }); + + it('omits COLORS: for empty string', () => { + const result = buildEmbeddingText({ ...base, colorPhrase: ' ' }); + expect(result).not.toMatch(/COLORS:/); + }); + + it('preserves scene metadata before transcript and colors', () => { + const result = buildEmbeddingText({ + ...base, + sceneData: { + shotType: 'wide shot', + timeOfDay: 'dusk', + weather: 'foggy', + }, + transcriptSegments: [{ text: 'speech here', start: 9, end: 11 }], + colorPhrase: 'deep blue', + }); + const sceneIdx = result.indexOf('SCENE:'); + const shotIdx = result.indexOf('SHOT:'); + const timeIdx = result.indexOf('TIME:'); + const weatherIdx = result.indexOf('WEATHER:'); + const speechIdx = result.indexOf('SPEECH:'); + const colorsIdx = result.indexOf('COLORS:'); + expect(sceneIdx).toBeLessThan(shotIdx); + expect(shotIdx).toBeLessThan(timeIdx); + expect(timeIdx).toBeLessThan(weatherIdx); + expect(weatherIdx).toBeLessThan(speechIdx); + expect(speechIdx).toBeLessThan(colorsIdx); + }); + + it('produces a valid string even with only a caption', () => { + const result = buildEmbeddingText({ + caption: { text: 'Minimal scene.', timeSec: 0 }, + }); + expect(result).toBe('SCENE: Minimal scene.'); + }); + + it('preserves richer scene captions verbatim for downstream semantic indexing', () => { + const result = buildEmbeddingText({ + caption: { text: 'Medium close-up of a singer on a rainy street at dusk.', timeSec: 12 }, + sceneData: { + shotType: 'medium close-up', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + expect(result).toBe( + 'SCENE: Medium close-up of a singer on a rainy street at dusk.\n' + + 'SHOT: medium close-up\n' + + 'TIME: dusk\n' + + 'WEATHER: rainy', + ); + }); + + it('turns json-ish caption model output into clean embedding text', () => { + const parsed = parseSceneCaptionResponse( + 'Json ["caption":"A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.","shotType":"medium wide shot","subjects":["lanterns","corridor","text"],"action":"glowing softly","setting":"interior corridor","lighting":"golden lantern light","timeOfDay":null,"weather":null}.', + ); + + expect(buildEmbeddingText({ + caption: { text: parsed.text, timeSec: 9 }, + sceneData: parsed.sceneData, + })).toBe( + 'SCENE: A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.\n' + + 'SHOT: medium-wide shot', + ); + }); +}); diff --git a/src/lib/analysis/embeddings/context.ts b/src/lib/analysis/embeddings/context.ts new file mode 100644 index 000000000..e8daebd6d --- /dev/null +++ b/src/lib/analysis/embeddings/context.ts @@ -0,0 +1,117 @@ +import type { SceneCaptionData } from '../captioning/types'; + +/** + * Embedding context builder. + * + * The caption text alone carries a lot of semantic signal, but the Scene + * Browser gets dramatically better results when adjacent context is + * folded into the string before embedding. We concatenate same-space + * signals into one structured input so that: + * + * - a query like "sunset in hokkaido" matches on caption + nearby + * speech even when neither alone is sufficient, + * - "orange sky" matches scenes whose caption doesn't name colors but + * whose thumbnail is dominated by warm tones, + * - "she explains the recipe" matches scenes where the caption is + * terse ("woman in kitchen") but the nearby transcript is rich.
+ * + * Missing signals are simply omitted — a no-transcript b-roll scene + * produces a shorter string, not a weaker vector. This is the whole + * reason we chose concat-and-embed-once over parallel vectors for + * same-modality signals. + */ + +export interface TranscriptSegment { + text: string; + start: number; + end: number; +} + +export interface BuildEmbeddingTextInput { + caption: { text: string; timeSec: number }; + sceneData?: SceneCaptionData; + /** + * Retained for call-site compatibility but unused — filename tokens + * turned out to be noise for editor workflows (proxied filenames, + * generic "final_export" stems drifted meaning more than they helped). + */ + fileName?: string; + /** Full transcript for the source media, used to slice per-caption. */ + transcriptSegments?: TranscriptSegment[] | null; + /** + * Human-readable dominant-color phrase for the caption's thumbnail, + * e.g. `"warm orange, deep teal, near black"`. Computed off the JPEG + * the captioning provider already captured at analyze time. This is + * a fuzzy hint for the transformer; the structural Lab palette in + * `paletteForLab` is what powers exact color-query ranking. + */ + colorPhrase?: string; +} + +/** ± radius in seconds around the caption timestamp to pull transcript from. */ +const DEFAULT_TRANSCRIPT_RADIUS_SEC = 2; + +/** Longer values drown the caption signal in transcript chatter. */ +const TRANSCRIPT_MAX_CHARS = 220; + +/** + * Pull transcript text that overlaps with a caption's time window. Joins + * the chosen segments and caps length so long speeches don't dominate + * the embedding input (all-MiniLM truncates around 256 tokens anyway). + */ +export function sliceTranscript( + segments: TranscriptSegment[] | null | undefined, + timeSec: number, + radiusSec: number = DEFAULT_TRANSCRIPT_RADIUS_SEC, +): string { + if (!segments || segments.length === 0) return ''; + const from = timeSec - radiusSec; + const to = timeSec + radiusSec; + const chunks: string[] = []; + for (const segment of segments) { + if (segment.end < from || segment.start > to) continue; + const text = segment.text.trim(); + if (text) chunks.push(text); + } + const joined = chunks.join(' ').replace(/\s+/g, ' ').trim(); + if (joined.length <= TRANSCRIPT_MAX_CHARS) return joined; + // Clip to a word boundary so the truncation doesn't leave half-words + // in the embedding input. + const clipped = joined.slice(0, TRANSCRIPT_MAX_CHARS); + const lastSpace = clipped.lastIndexOf(' '); + return lastSpace > TRANSCRIPT_MAX_CHARS * 0.6 ? clipped.slice(0, lastSpace) : clipped; +} + +/** + * Compose the string that actually gets embedded. Ordering matters a + * little — caption first because it's the primary signal, optional + * context lines after. Line prefixes like `SCENE:` aren't magic; they + * just give the transformer a small semantic anchor. + * + * Note: we deliberately don't include filename/filepath tokens here. + * They tested poorly in practice (proxied renders, generic "export" + * stems, project-template names) and shifted embeddings toward the + * *filename* rather than the scene content. 
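+ * + * @example + * // Mirrors the unit test — structured fields become prefixed lines: + * buildEmbeddingText({ + *   caption: { text: 'Medium close-up of a singer on a rainy street at dusk.', timeSec: 12 }, + *   sceneData: { shotType: 'medium close-up', timeOfDay: 'dusk', weather: 'rainy' }, + * }); + * // => 'SCENE: Medium close-up of a singer on a rainy street at dusk.\n' + + * //    'SHOT: medium close-up\nTIME: dusk\nWEATHER: rainy'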
+ */ +export function buildEmbeddingText(input: BuildEmbeddingTextInput): string { + const lines: string[] = []; + const caption = input.caption.text.trim(); + lines.push(`SCENE: ${caption}`); + + const shotType = input.sceneData?.shotType?.trim(); + if (shotType) lines.push(`SHOT: ${shotType}`); + + const timeOfDay = input.sceneData?.timeOfDay?.trim(); + if (timeOfDay) lines.push(`TIME: ${timeOfDay}`); + + const weather = input.sceneData?.weather?.trim(); + if (weather) lines.push(`WEATHER: ${weather}`); + + const speech = sliceTranscript(input.transcriptSegments, input.caption.timeSec); + if (speech) lines.push(`SPEECH: ${speech}`); + + const colors = input.colorPhrase?.trim(); + if (colors) lines.push(`COLORS: ${colors}`); + + return lines.join('\n'); +} diff --git a/src/lib/analysis/embeddings/create-clip-worker.ts b/src/lib/analysis/embeddings/create-clip-worker.ts new file mode 100644 index 000000000..7023d4409 --- /dev/null +++ b/src/lib/analysis/embeddings/create-clip-worker.ts @@ -0,0 +1,5 @@ +import ClipWorker from './clip-worker.ts?worker'; + +export function createClipWorker(): Worker { + return new ClipWorker(); +} diff --git a/src/lib/analysis/embeddings/create-embeddings-worker.ts b/src/lib/analysis/embeddings/create-embeddings-worker.ts new file mode 100644 index 000000000..cf74c897e --- /dev/null +++ b/src/lib/analysis/embeddings/create-embeddings-worker.ts @@ -0,0 +1,9 @@ +/** + * Vite-aware factory for the sentence-embeddings worker. + * Matches the pattern used by `create-lfm-worker.ts`. + */ +import EmbeddingsWorker from './embeddings-worker.ts?worker'; + +export function createEmbeddingsWorker(): Worker { + return new EmbeddingsWorker(); +} diff --git a/src/lib/analysis/embeddings/dominant-colors.ts b/src/lib/analysis/embeddings/dominant-colors.ts new file mode 100644 index 000000000..330965645 --- /dev/null +++ b/src/lib/analysis/embeddings/dominant-colors.ts @@ -0,0 +1,183 @@ +/** + * Cheap dominant-color extractor that turns a scene thumbnail into + * either a short English phrase ("warm orange, teal, near black") + * for embedding context, or a structural Lab palette for exact + * color-query ranking — we run one pass and emit both. + * + * Runs in ~5-15 ms per thumbnail on a downsampled 64×64 grid — much + * cheaper than k-means. Quantizes each pixel into a 4×4×4 RGB bucket + * (64 bins total), takes the most populated ones, and reports either + * a label string or the Lab + weight tuple. Lab coordinates use + * D65 sRGB as the source and are the canonical input for ∆E queries. + */ + +import { rgbToLab } from './lab-color'; + +const SAMPLE_SIZE = 64; +const TOP_COLOR_COUNT = 4; // one more than the phrase variant — palette ranking benefits from extra context +const MIN_BIN_FRACTION = 0.04; // ignore colors that cover <4% of the frame + +/** + * Structural entry in a scene's dominant color palette. Stored + * per-caption and queried at rank time via ∆E 2000 against user + * color terms. + */ +export interface PaletteEntry { + /** CIELAB components; `l` ∈ [0, 100], `a`/`b` ≈ [-128, 128]. */ + l: number; + a: number; + b: number; + /** 0–1 fraction of thumbnail pixels assigned to this bin. 
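Weights need not sum to 1 — bins covering less than 4% of the frame are dropped and only the top four are kept.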
*/ + weight: number; +} + +interface BinEntry { + count: number; + rSum: number; + gSum: number; + bSum: number; +} + +function rgbToHsl(r: number, g: number, b: number): { h: number; s: number; l: number } { + const rf = r / 255; + const gf = g / 255; + const bf = b / 255; + const max = Math.max(rf, gf, bf); + const min = Math.min(rf, gf, bf); + const l = (max + min) / 2; + let h = 0; + let s = 0; + if (max !== min) { + const delta = max - min; + s = l > 0.5 ? delta / (2 - max - min) : delta / (max + min); + switch (max) { + case rf: h = ((gf - bf) / delta + (gf < bf ? 6 : 0)); break; + case gf: h = ((bf - rf) / delta + 2); break; + default: h = ((rf - gf) / delta + 4); break; + } + h *= 60; + } + return { h, s: s * 100, l: l * 100 }; +} + +function hueLabel(hue: number): string { + // 8-way hue wheel — generic enough that a query like "orange sky" + // reliably hits thumbs with warm sunset tones, specific enough that + // "green" doesn't collapse into "yellow-green". + if (hue < 15 || hue >= 345) return 'red'; + if (hue < 40) return 'orange'; + if (hue < 65) return 'yellow'; + if (hue < 95) return 'yellow green'; + if (hue < 165) return 'green'; + if (hue < 200) return 'teal'; + if (hue < 255) return 'blue'; + if (hue < 285) return 'purple'; + if (hue < 345) return 'pink'; + return 'red'; +} + +function colorLabel(r: number, g: number, b: number): string { + const { h, s, l } = rgbToHsl(r, g, b); + if (l < 12) return 'near black'; + if (l > 92) return 'near white'; + if (s < 12) { + if (l < 35) return 'dark gray'; + if (l < 65) return 'gray'; + return 'light gray'; + } + const hue = hueLabel(h); + if (l < 25) return `dark ${hue}`; + if (l > 75) return `light ${hue}`; + if (l < 45 && s > 40) return `deep ${hue}`; + if (s > 70 && l > 50 && (hue === 'orange' || hue === 'red' || hue === 'yellow')) { + return `warm ${hue}`; + } + return hue; +} + +interface ExtractedColors { + /** Human-readable phrase for the embedding input. */ + phrase: string; + /** Lab+weight entries ranked by coverage, ready to ∆E against. */ + palette: PaletteEntry[]; +} + +/** + * One-pass dominant-color extraction. Returns both the labeled + * phrase (for the transformer-visible COLORS: line) and the + * structural Lab palette (for ∆E color-query ranking). 
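+ * + * @example + * // Sketch — `thumbBlob` is a scene JPEG; palette numbers are illustrative: + * const { phrase, palette } = await extractDominantColors(thumbBlob); + * // phrase  -> 'warm orange, teal, near black' + * // palette -> [{ l: 54.3, a: 42.1, b: 58.9, weight: 0.41 }, ...]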
+ */ +export async function extractDominantColors(blob: Blob): Promise<ExtractedColors> { + let bitmap: ImageBitmap | null = null; + try { + bitmap = await createImageBitmap(blob); + } catch { + return { phrase: '', palette: [] }; + } + try { + const canvas = new OffscreenCanvas(SAMPLE_SIZE, SAMPLE_SIZE); + const context = canvas.getContext('2d'); + if (!context) return { phrase: '', palette: [] }; + context.drawImage(bitmap, 0, 0, SAMPLE_SIZE, SAMPLE_SIZE); + const { data } = context.getImageData(0, 0, SAMPLE_SIZE, SAMPLE_SIZE); + + const bins = new Map<number, BinEntry>(); + const totalPixels = data.length / 4; + for (let i = 0; i < data.length; i += 4) { + const r = data[i]!; + const g = data[i + 1]!; + const b = data[i + 2]!; + const key = ((r >> 6) << 4) | ((g >> 6) << 2) | (b >> 6); + const bin = bins.get(key); + if (bin) { + bin.count += 1; + bin.rSum += r; + bin.gSum += g; + bin.bSum += b; + } else { + bins.set(key, { count: 1, rSum: r, gSum: g, bSum: b }); + } + } + + const ranked = [...bins.values()] + .filter((bin) => bin.count / totalPixels >= MIN_BIN_FRACTION) + .sort((a, b) => b.count - a.count) + .slice(0, TOP_COLOR_COUNT); + + const labels: string[] = []; + const seenLabels = new Set<string>(); + const palette: PaletteEntry[] = []; + + for (const bin of ranked) { + const r = Math.round(bin.rSum / bin.count); + const g = Math.round(bin.gSum / bin.count); + const b = Math.round(bin.bSum / bin.count); + + const label = colorLabel(r, g, b); + if (!seenLabels.has(label)) { + seenLabels.add(label); + labels.push(label); + } + + const lab = rgbToLab(r, g, b); + palette.push({ + l: Number(lab.l.toFixed(2)), + a: Number(lab.a.toFixed(2)), + b: Number(lab.b.toFixed(2)), + weight: Number((bin.count / totalPixels).toFixed(3)), + }); + } + + return { phrase: labels.join(', '), palette }; + } finally { + bitmap.close(); + } +} + +/** + * Back-compat helper for code paths that only need the human phrase + * (embedding input). Equivalent to `extractDominantColors().phrase`. + */ +export async function extractDominantColorPhrase(blob: Blob): Promise<string> { + return (await extractDominantColors(blob)).phrase; +} diff --git a/src/lib/analysis/embeddings/embeddings-provider.ts b/src/lib/analysis/embeddings/embeddings-provider.ts new file mode 100644 index 000000000..39d13669e --- /dev/null +++ b/src/lib/analysis/embeddings/embeddings-provider.ts @@ -0,0 +1,155 @@ +/** + * Singleton sentence-embedding provider built on top of `embeddings-worker`. + * + * A single worker instance is reused for the lifetime of the tab — + * re-instantiating would force another model download. The module-scoped + * state is intentional; callers should go through the exported + * {@link embeddingsProvider} rather than constructing anything themselves.
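+ * + * @example + * // Sketch — the ~22 MB model downloads lazily on the first call: + * const vector = await embeddingsProvider.embed('SCENE: A tree with orange leaves.'); + * // vector: Float32Array of length EMBEDDING_MODEL_DIM (384), already L2-normalized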
+ */ + +import { createLogger } from '@/shared/logging/logger'; +import { createEmbeddingsWorker } from './create-embeddings-worker'; +import { + EMBEDDING_MODEL_DIM, + EMBEDDING_MODEL_ID, + type EmbeddingsOptions, + type EmbeddingsProvider, +} from './types'; + +const log = createLogger('EmbeddingsProvider'); + +const INIT_TIMEOUT_MS = 60_000; + +let worker: Worker | null = null; +let readyPromise: Promise<void> | null = null; +let nextId = 0; + +function getWorker(): Worker { + if (!worker) { + worker = createEmbeddingsWorker(); + worker.addEventListener('error', (event) => { + log.error('Embeddings worker errored', event.message); + }); + } + return worker; +} + +function ensureReady(options: EmbeddingsOptions = {}): Promise<void> { + if (readyPromise) return readyPromise; + const w = getWorker(); + + readyPromise = new Promise<void>((resolve, reject) => { + const timeout = setTimeout(() => { + cleanup(); + reject(new Error('Embeddings worker init timed out')); + }, INIT_TIMEOUT_MS); + + const cleanup = () => { + clearTimeout(timeout); + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('Embedding init aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.type === 'ready') { + cleanup(); + resolve(); + return; + } + if (message.type === 'progress') { + options.onProgress?.({ stage: 'loading-model', percent: message.percent ?? 0 }); + return; + } + if (message.type === 'error' && message.id === undefined) { + cleanup(); + reject(new Error(message.message ?? 'Embeddings worker init failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + w.postMessage({ type: 'init' }); + }); + + readyPromise.catch(() => { + // A failed init should not pin the promise forever — subsequent calls + // will retry (model might have been offline, transient error, etc.). + readyPromise = null; + }); + + return readyPromise; +} + +function embedBatch(texts: string[], options: EmbeddingsOptions = {}): Promise<Float32Array[]> { + if (texts.length === 0) return Promise.resolve([]); + + const id = ++nextId; + const w = getWorker(); + + return ensureReady(options).then(() => new Promise<Float32Array[]>((resolve, reject) => { + const cleanup = () => { + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('Embedding aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.id !== id) return; + if (message.type === 'embeddings') { + cleanup(); + resolve(message.vectors as Float32Array[]); + return; + } + if (message.type === 'error') { + cleanup(); + reject(new Error(message.message ??
+      }
+    };
+
+    if (options.signal?.aborted) {
+      cleanup();
+      reject(options.signal.reason);
+      return;
+    }
+    options.signal?.addEventListener('abort', onAbort, { once: true });
+
+    w.addEventListener('message', onMessage);
+    w.postMessage({ type: 'embed', id, texts });
+  }));
+}
+
+export const embeddingsProvider: EmbeddingsProvider = {
+  ensureReady,
+  async embed(text, options) {
+    const [vector] = await embedBatch([text], options);
+    if (!vector) throw new Error('Embedding returned no vector');
+    return vector;
+  },
+  embedBatch,
+  dispose() {
+    if (!worker) return;
+    worker.postMessage({ type: 'dispose' });
+    worker.terminate();
+    worker = null;
+    readyPromise = null;
+  },
+};
+
+export { EMBEDDING_MODEL_ID, EMBEDDING_MODEL_DIM };
diff --git a/src/lib/analysis/embeddings/embeddings-worker.ts b/src/lib/analysis/embeddings/embeddings-worker.ts
new file mode 100644
index 000000000..64b4ec511
--- /dev/null
+++ b/src/lib/analysis/embeddings/embeddings-worker.ts
@@ -0,0 +1,125 @@
+/**
+ * Web Worker for sentence-embedding generation using Xenova/all-MiniLM-L6-v2.
+ *
+ * The model is quantized (~22 MB) and runs via `pipeline('feature-extraction')`
+ * from @huggingface/transformers. Loaded lazily on first init, cached in the
+ * browser after download.
+ *
+ * Messages:
+ *   → { type: 'init' } — preload model
+ *   → { type: 'embed', id, texts: string[] } — batch embed
+ *   → { type: 'dispose' } — release model
+ *   ← { type: 'ready', dim: number } — model loaded; embedding dimension
+ *   ← { type: 'progress', percent: number } — model download progress
+ *   ← { type: 'embeddings', id, vectors: Float32Array[] } — batch result
+ *   ← { type: 'error', id?, message } — error
+ */
+
+import { pipeline, env, type FeatureExtractionPipeline } from '@huggingface/transformers';
+
+const MODEL_ID = 'Xenova/all-MiniLM-L6-v2';
+
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let extractor: FeatureExtractionPipeline | null = null;
+let loading = false;
+let disposed = false;
+let loadGeneration = 0;
+let embeddingDim = 384;
+
+function post(msg: Record<string, unknown>): void {
+  self.postMessage(msg);
+}
+
+async function loadModel(): Promise<void> {
+  if (extractor) {
+    post({ type: 'ready', dim: embeddingDim });
+    return;
+  }
+  if (loading) return;
+  loading = true;
+  disposed = false;
+  const thisGen = ++loadGeneration;
+
+  try {
+    let lastPct = 0;
+    const loaded = await pipeline('feature-extraction', MODEL_ID, {
+      dtype: 'q8',
+      progress_callback: (info: { status?: string; total?: number; loaded?: number }) => {
+        if (info.status === 'progress' && info.total && info.loaded) {
+          const pct = (info.loaded / info.total) * 100;
+          if (pct - lastPct > 2) {
+            lastPct = pct;
+            post({ type: 'progress', percent: Math.round(pct) });
+          }
+        }
+      },
+    });
+
+    if (disposed || thisGen !== loadGeneration) {
+      return;
+    }
+
+    extractor = loaded as FeatureExtractionPipeline;
+    // Probe dimension with a one-token warmup so the first real query isn't
+    // the one that pays the shape-inference cost.
+    const warmup = await extractor('probe', { pooling: 'mean', normalize: true });
+    embeddingDim = Array.isArray(warmup.dims) ? Number(warmup.dims[warmup.dims.length - 1]) : 384;
+
+    post({ type: 'ready', dim: embeddingDim });
+  } catch (error) {
+    post({ type: 'error', message: error instanceof Error ? error.message : String(error) });
+  } finally {
+    loading = false;
+  }
+}
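+
+// Example exchange (hypothetical values), as seen from the provider side:
+//   → { type: 'embed', id: 3, texts: ['a red car', 'ocean at dusk'] }
+//   ← { type: 'embeddings', id: 3, vectors: [Float32Array(384), Float32Array(384)] }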
+
+async function embedBatch(id: number, texts: string[]): Promise<void> {
+  if (!extractor) {
+    post({ type: 'error', id, message: 'Embeddings worker not ready' });
+    return;
+  }
+  try {
+    // Mean-pool + L2-normalize so cosine similarity becomes a dot product
+    // at the ranking site — no per-row normalization needed downstream.
+    const tensor = await extractor(texts, { pooling: 'mean', normalize: true });
+    const flat = tensor.data as Float32Array;
+    const dim = embeddingDim;
+    const vectors: Float32Array[] = [];
+    for (let i = 0; i < texts.length; i += 1) {
+      vectors.push(flat.slice(i * dim, (i + 1) * dim));
+    }
+    // Transfer the underlying buffers: each `slice` above owns its own
+    // ArrayBuffer, so transferring avoids a copy per 384-dim vector across
+    // the worker boundary.
+    self.postMessage(
+      { type: 'embeddings', id, vectors },
+      vectors.map((vector) => vector.buffer as ArrayBuffer),
+    );
+  } catch (error) {
+    post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) });
+  }
+}
+
+self.addEventListener('message', (event: MessageEvent) => {
+  const message = event.data;
+  if (!message || typeof message.type !== 'string') return;
+
+  if (message.type === 'init') {
+    void loadModel();
+    return;
+  }
+
+  if (message.type === 'embed') {
+    const id = typeof message.id === 'number' ? message.id : 0;
+    const texts = Array.isArray(message.texts) ? message.texts.filter((t: unknown) => typeof t === 'string') : [];
+    void embedBatch(id, texts);
+    return;
+  }
+
+  if (message.type === 'dispose') {
+    disposed = true;
+    extractor = null;
+    loading = false;
+    return;
+  }
+});
diff --git a/src/lib/analysis/embeddings/index.ts b/src/lib/analysis/embeddings/index.ts
new file mode 100644
index 000000000..bcff3e303
--- /dev/null
+++ b/src/lib/analysis/embeddings/index.ts
@@ -0,0 +1,16 @@
+export { embeddingsProvider, EMBEDDING_MODEL_ID, EMBEDDING_MODEL_DIM } from './embeddings-provider';
+export { clipProvider, CLIP_MODEL_ID, CLIP_EMBEDDING_DIM } from './clip-provider';
+export type {
+  EmbeddingsOptions,
+  EmbeddingsProgress,
+  EmbeddingsProvider,
+} from './types';
+export {
+  buildEmbeddingText,
+  sliceTranscript,
+} from './context';
+export type { BuildEmbeddingTextInput, TranscriptSegment } from './context';
+export { extractDominantColors, extractDominantColorPhrase } from './dominant-colors';
+export type { PaletteEntry } from './dominant-colors';
+export { rgbToLab, deltaE76, deltaE2000 } from './lab-color';
+export type { LabColor } from './lab-color';
diff --git a/src/lib/analysis/embeddings/lab-color.test.ts b/src/lib/analysis/embeddings/lab-color.test.ts
new file mode 100644
index 000000000..67df38ab3
--- /dev/null
+++ b/src/lib/analysis/embeddings/lab-color.test.ts
@@ -0,0 +1,80 @@
+import { describe, expect, it } from 'vitest';
+import { deltaE2000, deltaE76, rgbToLab } from './lab-color';
+
+describe('rgbToLab', () => {
+  it('maps pure white to L=100, a=0, b=0', () => {
+    const { l, a, b } = rgbToLab(255, 255, 255);
+    expect(l).toBeCloseTo(100, 1);
+    expect(a).toBeCloseTo(0, 1);
+    expect(b).toBeCloseTo(0, 1);
+  });
+
+  it('maps pure black to L=0', () => {
+    const { l, a, b } = rgbToLab(0, 0, 0);
+    expect(l).toBeCloseTo(0, 1);
+    expect(a).toBeCloseTo(0, 1);
+    expect(b).toBeCloseTo(0, 1);
+  });
+
+  it('maps pure sRGB red to the canonical Lab red region', () => {
+    // Reference values from Bruce Lindbloom's sRGB calculator.
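+    // Note: toBeCloseTo(x, 0) asserts |actual - x| < 0.5, loose enough to
+    // absorb rounding in the sRGB matrix constants.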
+ const lab = rgbToLab(255, 0, 0); + expect(lab.l).toBeCloseTo(53.24, 0); + expect(lab.a).toBeCloseTo(80.09, 0); + expect(lab.b).toBeCloseTo(67.20, 0); + }); + + it('maps pure sRGB green to its known Lab coordinates', () => { + const lab = rgbToLab(0, 255, 0); + expect(lab.l).toBeCloseTo(87.73, 0); + expect(lab.a).toBeCloseTo(-86.18, 0); + expect(lab.b).toBeCloseTo(83.18, 0); + }); + + it('maps pure sRGB blue to its known Lab coordinates', () => { + const lab = rgbToLab(0, 0, 255); + expect(lab.l).toBeCloseTo(32.30, 0); + expect(lab.a).toBeCloseTo(79.19, 0); + expect(lab.b).toBeCloseTo(-107.86, 0); + }); +}); + +describe('deltaE76', () => { + it('returns 0 for identical colors', () => { + const red = rgbToLab(255, 0, 0); + expect(deltaE76(red, red)).toBeCloseTo(0, 5); + }); + + it('is larger between red and blue than between red and dark red', () => { + const red = rgbToLab(255, 0, 0); + const darkRed = rgbToLab(180, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE76(red, blue)).toBeGreaterThan(deltaE76(red, darkRed)); + }); +}); + +describe('deltaE2000', () => { + it('returns 0 for identical colors', () => { + const red = rgbToLab(255, 0, 0); + expect(deltaE2000(red, red)).toBeCloseTo(0, 5); + }); + + it('gives a small delta for near-duplicate reds', () => { + const red = rgbToLab(255, 0, 0); + const nearRed = rgbToLab(250, 5, 5); + expect(deltaE2000(red, nearRed)).toBeLessThan(3); + }); + + it('gives a large delta for red vs blue', () => { + const red = rgbToLab(255, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE2000(red, blue)).toBeGreaterThan(40); + }); + + it('ranks "orange vs red" closer than "orange vs blue"', () => { + const orange = rgbToLab(255, 128, 0); + const red = rgbToLab(255, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE2000(orange, red)).toBeLessThan(deltaE2000(orange, blue)); + }); +}); diff --git a/src/lib/analysis/embeddings/lab-color.ts b/src/lib/analysis/embeddings/lab-color.ts new file mode 100644 index 000000000..900439ad6 --- /dev/null +++ b/src/lib/analysis/embeddings/lab-color.ts @@ -0,0 +1,156 @@ +/** + * sRGB → CIELAB conversion and Delta E perceptual distance. + * + * Color-by-query is notoriously hard for CLIP — CLIP was trained on + * captions like `"a photo of a red firetruck"` where color is attached + * to an object, so bare color queries drift to weak matches. Industry + * CBIR systems (Imgix, TinEye, classic color histograms) use the + * CIELAB color space with ∆E distance because Lab is approximately + * perceptually uniform — equal ∆E steps correspond to equal visible + * differences. This module provides that pipeline. + * + * Conversion constants come from the D65 reference illuminant, which + * matches sRGB's standard viewing conditions. The ∆E 2000 formula is + * the industry standard for perceptual distance; ∆E 76 is the + * simpler Euclidean version used as a fast fallback. + */ + +export interface LabColor { + l: number; + a: number; + b: number; +} + +// D65 reference white in XYZ. +const REF_X = 0.95047; +const REF_Y = 1.0; +const REF_Z = 1.08883; + +function sRgbCompand(v: number): number { + // Inverse of the sRGB gamma companding — get back to linear light. + const normalized = v / 255; + return normalized <= 0.04045 + ? normalized / 12.92 + : Math.pow((normalized + 0.055) / 1.055, 2.4); +} + +function labFTransform(t: number): number { + const epsilon = 216 / 24389; // 0.008856... + const kappa = 24389 / 27; // 903.3... + return t > epsilon + ? 
Math.cbrt(t) + : (kappa * t + 16) / 116; +} + +/** + * Convert 0–255 sRGB values to CIELAB. Input assumed gamma-encoded + * (as JPEGs are). Output `l` is in 0–100, `a`/`b` roughly in -128..127. + */ +export function rgbToLab(r: number, g: number, b: number): LabColor { + const rLin = sRgbCompand(r); + const gLin = sRgbCompand(g); + const bLin = sRgbCompand(b); + + // sRGB → XYZ (D65) + const x = rLin * 0.4124564 + gLin * 0.3575761 + bLin * 0.1804375; + const y = rLin * 0.2126729 + gLin * 0.7151522 + bLin * 0.0721750; + const z = rLin * 0.0193339 + gLin * 0.1191920 + bLin * 0.9503041; + + // XYZ → Lab + const fx = labFTransform(x / REF_X); + const fy = labFTransform(y / REF_Y); + const fz = labFTransform(z / REF_Z); + + return { + l: 116 * fy - 16, + a: 500 * (fx - fy), + b: 200 * (fy - fz), + }; +} + +/** + * Simple Euclidean distance in Lab (∆E 76). Cheap, approximate — + * values below ~2 are visually indistinguishable, 2–10 is a subtle + * change, 10+ is obviously different. + */ +export function deltaE76(a: LabColor, b: LabColor): number { + const dL = a.l - b.l; + const dA = a.a - b.a; + const dB = a.b - b.b; + return Math.sqrt(dL * dL + dA * dA + dB * dB); +} + +/** + * CIEDE 2000 — industry-standard perceptual distance. Corrects for + * known issues with ∆E 76 (hue non-linearity, blue/purple cluster + * distortion). More expensive but still cheap enough to run per + * palette entry per query on the hot path. + * + * Formula source: Sharma et al. (2005), "The CIEDE2000 Color-Difference + * Formula: Implementation Notes, Supplementary Test Data, and + * Mathematical Observations." + */ +export function deltaE2000(c1: LabColor, c2: LabColor): number { + const { l: l1, a: a1, b: b1 } = c1; + const { l: l2, a: a2, b: b2 } = c2; + + const avgL = (l1 + l2) / 2; + const c1ab = Math.sqrt(a1 * a1 + b1 * b1); + const c2ab = Math.sqrt(a2 * a2 + b2 * b2); + const avgC = (c1ab + c2ab) / 2; + + const g = 0.5 * (1 - Math.sqrt(Math.pow(avgC, 7) / (Math.pow(avgC, 7) + Math.pow(25, 7)))); + const a1p = a1 * (1 + g); + const a2p = a2 * (1 + g); + + const c1p = Math.sqrt(a1p * a1p + b1 * b1); + const c2p = Math.sqrt(a2p * a2p + b2 * b2); + const avgCp = (c1p + c2p) / 2; + + const h1p = Math.atan2(b1, a1p) >= 0 + ? Math.atan2(b1, a1p) + : Math.atan2(b1, a1p) + 2 * Math.PI; + const h2p = Math.atan2(b2, a2p) >= 0 + ? Math.atan2(b2, a2p) + : Math.atan2(b2, a2p) + 2 * Math.PI; + + const dHp = (() => { + if (c1p * c2p === 0) return 0; + const diff = h2p - h1p; + if (Math.abs(diff) <= Math.PI) return diff; + return diff > Math.PI ? diff - 2 * Math.PI : diff + 2 * Math.PI; + })(); + + const dLp = l2 - l1; + const dCp = c2p - c1p; + const dHpFinal = 2 * Math.sqrt(c1p * c2p) * Math.sin(dHp / 2); + + const avgHp = (() => { + if (c1p * c2p === 0) return h1p + h2p; + if (Math.abs(h1p - h2p) <= Math.PI) return (h1p + h2p) / 2; + return h1p + h2p < 2 * Math.PI + ? 
(h1p + h2p + 2 * Math.PI) / 2
+      : (h1p + h2p - 2 * Math.PI) / 2;
+  })();
+
+  const t = 1
+    - 0.17 * Math.cos(avgHp - Math.PI / 6)
+    + 0.24 * Math.cos(2 * avgHp)
+    + 0.32 * Math.cos(3 * avgHp + Math.PI / 30)
+    - 0.20 * Math.cos(4 * avgHp - (63 * Math.PI) / 180);
+
+  const sl = 1 + (0.015 * Math.pow(avgL - 50, 2)) / Math.sqrt(20 + Math.pow(avgL - 50, 2));
+  const sc = 1 + 0.045 * avgCp;
+  const sh = 1 + 0.015 * avgCp * t;
+
+  const dTheta = (30 * Math.PI / 180) * Math.exp(-Math.pow((avgHp * 180 / Math.PI - 275) / 25, 2));
+  const rc = 2 * Math.sqrt(Math.pow(avgCp, 7) / (Math.pow(avgCp, 7) + Math.pow(25, 7)));
+  const rt = -rc * Math.sin(2 * dTheta);
+
+  return Math.sqrt(
+    Math.pow(dLp / sl, 2)
+    + Math.pow(dCp / sc, 2)
+    + Math.pow(dHpFinal / sh, 2)
+    + rt * (dCp / sc) * (dHpFinal / sh),
+  );
+}
diff --git a/src/lib/analysis/embeddings/types.ts b/src/lib/analysis/embeddings/types.ts
new file mode 100644
index 000000000..0c075b553
--- /dev/null
+++ b/src/lib/analysis/embeddings/types.ts
@@ -0,0 +1,31 @@
+/**
+ * Public types for the sentence-embedding provider.
+ *
+ * The model identifier and dimension are exposed so consumers can persist
+ * them alongside stored embeddings and detect mismatch on load (e.g. if
+ * we switch to a larger model later, old vectors must be re-generated).
+ */
+
+export const EMBEDDING_MODEL_ID = 'Xenova/all-MiniLM-L6-v2';
+export const EMBEDDING_MODEL_DIM = 384;
+
+export interface EmbeddingsProgress {
+  stage: 'loading-model' | 'idle';
+  percent: number;
+}
+
+export interface EmbeddingsOptions {
+  onProgress?: (progress: EmbeddingsProgress) => void;
+  signal?: AbortSignal;
+}
+
+export interface EmbeddingsProvider {
+  /** Ensures the model is loaded; safe to call repeatedly. */
+  ensureReady(options?: EmbeddingsOptions): Promise<void>;
+  /** Embed one text. Returns a unit-length 384-dim vector. */
+  embed(text: string, options?: EmbeddingsOptions): Promise<Float32Array>;
+  /** Embed a batch. More efficient than calling `embed` in a loop. */
+  embedBatch(texts: string[], options?: EmbeddingsOptions): Promise<Float32Array[]>;
+  /** Release the worker and free the underlying model memory.
*/ + dispose(): void; +} diff --git a/src/lib/analysis/index.ts b/src/lib/analysis/index.ts index 1f4da7258..7ba4642e5 100644 --- a/src/lib/analysis/index.ts +++ b/src/lib/analysis/index.ts @@ -1,7 +1,12 @@ export { OpticalFlowAnalyzer } from './optical-flow-analyzer'; export type { MotionResult } from './optical-flow-analyzer'; export { detectScenes, clearSceneCache } from './scene-detection'; -export type { SceneCut, SceneDetectionProgress, DetectScenesOptions, VerificationModel } from './scene-detection'; +export type { + SceneCut, + SceneDetectionProgress, + DetectScenesOptions, + VerificationModel, +} from './scene-detection'; export { getDefaultSceneVerificationProvider, getSceneVerificationModelLabel, @@ -14,4 +19,28 @@ export type { HistogramDetectOptions } from './histogram-scene-detection'; export { seekVideo, deduplicateCuts } from './scene-detection-utils'; export { captionVideo, captionImage } from './media-tagger'; export type { MediaCaption, CaptioningProgress, CaptioningOptions } from './media-tagger'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + sliceTranscript, + extractDominantColors, + extractDominantColorPhrase, + rgbToLab, + deltaE76, + deltaE2000, +} from './embeddings'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from './embeddings'; export { ANALYSIS_WIDTH, ANALYSIS_HEIGHT, PYRAMID_LEVELS } from './optical-flow-shaders'; diff --git a/src/lib/analysis/lfm-scene-worker.ts b/src/lib/analysis/lfm-scene-worker.ts index 2021b4f01..566d2fd68 100644 --- a/src/lib/analysis/lfm-scene-worker.ts +++ b/src/lib/analysis/lfm-scene-worker.ts @@ -22,6 +22,10 @@ import { RawImage, env, } from '@huggingface/transformers'; +import { + LFM_SCENE_CAPTION_PROMPT, + parseSceneCaptionResponse, +} from './captioning/scene-caption-format'; const MODEL_ID = 'LiquidAI/LFM2.5-VL-450M-ONNX'; @@ -36,6 +40,7 @@ let model: any = null; let loading = false; let disposed = false; let loadGeneration = 0; +const DESCRIBE_MAX_NEW_TOKENS = 160; function post(msg: Record): void { self.postMessage(msg); @@ -192,8 +197,6 @@ async function verifyCandidate( } } -const DESCRIBE_PROMPT = 'Describe the scene in one sentence.'; - async function describeImage(id: number, imageBlob: Blob): Promise { if (!model || !processor) { post({ type: 'error', message: 'Model not loaded' }); @@ -208,7 +211,7 @@ async function describeImage(id: number, imageBlob: Blob): Promise { role: 'user', content: [ { type: 'image' }, - { type: 'text', text: DESCRIBE_PROMPT }, + { type: 'text', text: LFM_SCENE_CAPTION_PROMPT }, ], }, ]; @@ -221,7 +224,7 @@ async function describeImage(id: number, imageBlob: Blob): Promise { const outputs = await model.generate({ ...inputs, - max_new_tokens: 128, + max_new_tokens: DESCRIBE_MAX_NEW_TOKENS, do_sample: false, repetition_penalty: 1.05, }); @@ -231,8 +234,13 @@ async function describeImage(id: number, imageBlob: Blob): Promise { { skip_special_tokens: true }, ); - const caption = (decoded[0] ?? '').trim(); - post({ type: 'caption', id, caption }); + const parsed = parseSceneCaptionResponse(decoded[0] ?? 
''); + post({ + type: 'caption', + id, + caption: parsed.text, + sceneData: parsed.sceneData, + }); } catch (err) { post({ type: 'caption', id, caption: '', error: (err as Error).message }); } diff --git a/src/main.tsx b/src/main.tsx index b196aca22..b348cfdf0 100644 --- a/src/main.tsx +++ b/src/main.tsx @@ -56,7 +56,8 @@ window.addEventListener('vite:preloadError', () => { }); // IMPORTANT: Intentionally do not dispose filmstrip cache on beforeunload. -// Filmstrip OPFS data is persistent and should survive refresh/reload. +// Filmstrip cache data is persistent in the workspace and +// should survive refresh/reload. // The browser tears down workers/resources on navigation anyway. const rootElement = document.getElementById('root'); @@ -70,4 +71,3 @@ createRoot(rootElement).render( ); - diff --git a/src/routes/projects/index.tsx b/src/routes/projects/index.tsx index 1bf4b1725..21b39148c 100644 --- a/src/routes/projects/index.tsx +++ b/src/routes/projects/index.tsx @@ -36,7 +36,7 @@ export const Route = createFileRoute('/projects/')({ // Clean up any media blob URLs when returning to projects page beforeLoad: async () => { cleanupBlobUrls(); - // Always reload projects from IndexedDB to get fresh data (thumbnails may have changed) + // Always reload projects from storage to get fresh data (thumbnails may have changed) const { loadProjects } = useProjectStore.getState(); await loadProjects(); }, @@ -473,4 +473,3 @@ function ProjectsIndex() { ); } - diff --git a/src/shared/components/color-scopes-view.tsx b/src/shared/components/color-scopes-view.tsx index 5a2052b9c..b14686e45 100644 --- a/src/shared/components/color-scopes-view.tsx +++ b/src/shared/components/color-scopes-view.tsx @@ -1,6 +1,6 @@ import { memo, useCallback, useEffect, useRef, useState } from 'react'; import { Activity } from 'lucide-react'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { cn } from '@/shared/ui/cn'; import { Button } from '@/components/ui/button'; @@ -602,8 +602,15 @@ export const ColorScopesView = memo(function ColorScopesView({ if (!captureFrameImageData && !captureFrame) return; const getRequestedFrame = () => { - const s = usePlaybackStore.getState(); - return s.previewFrame ?? s.currentFrame; + const playbackState = usePlaybackStore.getState(); + return getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); }; const requestedFrame = getRequestedFrame(); @@ -715,21 +722,66 @@ export const ColorScopesView = memo(function ColorScopesView({ scheduleDraw(); - const unsubscribe = usePlaybackStore.subscribe((state, previousState) => { + const scheduleIfFrameChanged = (nextRequestedFrame: number, previousRequestedFrame: number) => { + if (nextRequestedFrame !== previousRequestedFrame) { + scheduleDraw(); + } + }; + + const unsubscribePlayback = usePlaybackStore.subscribe((state, previousState) => { if (state.isPlaying) { return; } - const nextRequestedFrame = state.previewFrame ?? state.currentFrame; - const previousRequestedFrame = previousState.previewFrame ?? 
previousState.currentFrame; + const nextRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: state.currentFrame, + currentFrameEpoch: state.currentFrameEpoch, + previewFrame: state.previewFrame, + previewFrameEpoch: state.previewFrameEpoch, + isPlaying: state.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); + const previousRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: previousState.currentFrame, + currentFrameEpoch: previousState.currentFrameEpoch, + previewFrame: previousState.previewFrame, + previewFrameEpoch: previousState.previewFrameEpoch, + isPlaying: previousState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); - if (nextRequestedFrame !== previousRequestedFrame) { - scheduleDraw(); + scheduleIfFrameChanged(nextRequestedFrame, previousRequestedFrame); + }); + + const unsubscribePreviewBridge = usePreviewBridgeStore.subscribe((bridgeState, previousBridgeState) => { + const playbackState = usePlaybackStore.getState(); + if (playbackState.isPlaying) { + return; } + + const nextRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: bridgeState.displayedFrame, + }); + const previousRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: previousBridgeState.displayedFrame, + }); + + scheduleIfFrameChanged(nextRequestedFrame, previousRequestedFrame); }); return () => { - unsubscribe(); + unsubscribePlayback(); + unsubscribePreviewBridge(); if (rafId !== null) { cancelAnimationFrame(rafId); } diff --git a/src/shared/state/playback/index.ts b/src/shared/state/playback/index.ts index 3a6715481..745d9808d 100644 --- a/src/shared/state/playback/index.ts +++ b/src/shared/state/playback/index.ts @@ -1,5 +1,6 @@ export { usePlaybackStore } from './store'; export { getResolvedPlaybackFrame } from './frame-resolution'; +export { commitPreviewFrameToCurrentFrame } from './preview-handoff'; export type { CaptureOptions, PreviewQuality, diff --git a/src/shared/state/playback/preview-handoff.test.ts b/src/shared/state/playback/preview-handoff.test.ts new file mode 100644 index 000000000..62e8ceda0 --- /dev/null +++ b/src/shared/state/playback/preview-handoff.test.ts @@ -0,0 +1,45 @@ +import { beforeEach, describe, expect, it } from 'vitest'; +import { usePlaybackStore } from './store'; +import { commitPreviewFrameToCurrentFrame } from './preview-handoff'; + +describe('commitPreviewFrameToCurrentFrame', () => { + beforeEach(() => { + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 0, + isPlaying: false, + playbackRate: 1, + loop: false, + volume: 1, + muted: false, + masterBusDb: 0, + busAudioEq: undefined, + zoom: -1, + previewFrame: null, + previewFrameEpoch: 0, + frameUpdateEpoch: 0, + previewItemId: null, + useProxy: true, + previewQuality: 1, + }); + }); + + it('promotes the active preview frame before clearing it', () => { + usePlaybackStore.getState().setPreviewFrame(48, 'item-1'); + + commitPreviewFrameToCurrentFrame(); + + const state = usePlaybackStore.getState(); + expect(state.currentFrame).toBe(48); + 
expect(state.previewFrame).toBeNull(); + expect(state.previewItemId).toBeNull(); + }); + + it('does nothing when there is no active preview frame', () => { + commitPreviewFrameToCurrentFrame(); + + const state = usePlaybackStore.getState(); + expect(state.currentFrame).toBe(12); + expect(state.previewFrame).toBeNull(); + }); +}); diff --git a/src/shared/state/playback/preview-handoff.ts b/src/shared/state/playback/preview-handoff.ts new file mode 100644 index 000000000..55ddc1577 --- /dev/null +++ b/src/shared/state/playback/preview-handoff.ts @@ -0,0 +1,16 @@ +import { usePlaybackStore } from './store'; + +/** + * Promote an active transient skim/preview frame into the authoritative + * current frame before clearing preview state. This prevents edit gestures + * from briefly snapping back to the stale pre-skim playhead frame. + */ +export function commitPreviewFrameToCurrentFrame(): void { + const playback = usePlaybackStore.getState(); + if (playback.previewFrame === null) { + return; + } + + playback.setScrubFrame(playback.previewFrame, playback.previewItemId); + playback.setPreviewFrame(null); +} diff --git a/src/shared/state/preview-bridge/store.test.ts b/src/shared/state/preview-bridge/store.test.ts index b914e443a..b2f5970ab 100644 --- a/src/shared/state/preview-bridge/store.test.ts +++ b/src/shared/state/preview-bridge/store.test.ts @@ -8,6 +8,7 @@ describe('preview-bridge-store', () => { captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, }); }); @@ -17,6 +18,7 @@ describe('preview-bridge-store', () => { captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, }); }); @@ -60,4 +62,24 @@ describe('preview-bridge-store', () => { expect(await state.captureFrameImageData?.()).toBeNull(); expect(await state.captureCanvasSource?.()).toBeNull(); }); + + it('stores post-edit warm requests with normalized frames and incrementing tokens', () => { + const store = usePreviewBridgeStore.getState(); + + store.requestPostEditWarm(48.6, ['clip-1'], [48.6, 47.8, 48.6, -2]); + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toEqual({ + frame: 49, + frames: [49, 48, 0], + itemIds: ['clip-1'], + token: 1, + }); + + store.requestPostEditWarm(-2, ['clip-2', 'clip-3']); + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toEqual({ + frame: 0, + frames: [0], + itemIds: ['clip-2', 'clip-3'], + token: 2, + }); + }); }); diff --git a/src/shared/state/preview-bridge/store.ts b/src/shared/state/preview-bridge/store.ts index 5027c7351..920601c11 100644 --- a/src/shared/state/preview-bridge/store.ts +++ b/src/shared/state/preview-bridge/store.ts @@ -12,11 +12,26 @@ function normalizeFrame(frame: number | null): number | null { return Math.max(0, Math.round(frame)); } +function normalizeFrames(frames: number[]): number[] { + const normalized: number[] = []; + const seen = new Set(); + + for (const frame of frames) { + const nextFrame = normalizeFrame(frame); + if (nextFrame == null || seen.has(nextFrame)) continue; + seen.add(nextFrame); + normalized.push(nextFrame); + } + + return normalized; +} + export const usePreviewBridgeStore = create()((set) => ({ displayedFrame: null, captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, setDisplayedFrame: (frame) => set((state) => { @@ -27,4 +42,18 @@ export const usePreviewBridgeStore = create set({ captureFrame: fn }), setCaptureFrameImageData: (fn) => set({ captureFrameImageData: fn }), 
 setCaptureCanvasSource: (fn) => set({ captureCanvasSource: fn }),
+  requestPostEditWarm: (frame, itemIds, frames = []) =>
+    set((state) => {
+      const normalizedFrame = normalizeFrame(frame) ?? 0;
+      const normalizedFrames = normalizeFrames(frames.length > 0 ? frames : [normalizedFrame]);
+
+      return {
+        postEditWarmRequest: {
+          frame: normalizedFrame,
+          frames: normalizedFrames,
+          itemIds: [...itemIds],
+          token: (state.postEditWarmRequest?.token ?? 0) + 1,
+        },
+      };
+    }),
 }));
diff --git a/src/shared/state/preview-bridge/types.ts b/src/shared/state/preview-bridge/types.ts
index 435144cb9..719b02fd5 100644
--- a/src/shared/state/preview-bridge/types.ts
+++ b/src/shared/state/preview-bridge/types.ts
@@ -1,5 +1,12 @@
 import type { CaptureOptions } from '@/shared/state/playback';
 
+export interface PostEditWarmRequest {
+  frame: number;
+  frames: number[];
+  itemIds: string[];
+  token: number;
+}
+
 export interface PreviewBridgeState {
   /** Frame currently presented to the user in preview output (null when Player path is active) */
   displayedFrame: number | null;
@@ -9,6 +16,8 @@
   captureFrameImageData: ((options?: CaptureOptions) => Promise<ImageData | null>) | null;
   /** Returns the rendered canvas directly for GPU-accelerated scope analysis (near-zero-copy) */
   captureCanvasSource: (() => Promise<HTMLCanvasElement | OffscreenCanvas | null>) | null;
+  /** Latest request to prewarm the preview renderer after an edit commit. */
+  postEditWarmRequest: PostEditWarmRequest | null;
 }
 
 export interface PreviewBridgeActions {
@@ -19,4 +28,5 @@
   setCaptureFrameImageData: (fn: ((options?: CaptureOptions) => Promise<ImageData | null>) | null) => void;
   /** Register canvas source capture for GPU scopes (optional) */
   setCaptureCanvasSource: (fn: (() => Promise<HTMLCanvasElement | OffscreenCanvas | null>) | null) => void;
+  requestPostEditWarm: (frame: number, itemIds: string[], frames?: number[]) => void;
 }
diff --git a/src/shared/state/source-player/store.test.ts b/src/shared/state/source-player/store.test.ts
index 94b49196f..754a3cf8c 100644
--- a/src/shared/state/source-player/store.test.ts
+++ b/src/shared/state/source-player/store.test.ts
@@ -8,6 +8,7 @@
     playerMethods: null,
     currentMediaId: null,
     currentSourceFrame: 0,
+    previewSourceFrame: null,
     inPoint: null,
     outPoint: null,
     pendingSeekFrame: null,
@@ -25,6 +26,7 @@
     expect(useSourcePlayerStore.getState()).toMatchObject({
       currentMediaId: null,
      currentSourceFrame: 0,
+      previewSourceFrame: null,
      inPoint: null,
      outPoint: null,
    });
@@ -44,6 +46,7 @@
      currentMediaId: 'media-2',
      inPoint: 75,
      outPoint: 150,
+      previewSourceFrame: null,
      pendingSeekFrame: 75,
    });
  });
diff --git a/src/shared/state/source-player/store.ts b/src/shared/state/source-player/store.ts
index 7285d7521..f03e0807d 100644
--- a/src/shared/state/source-player/store.ts
+++ b/src/shared/state/source-player/store.ts
@@ -6,20 +6,39 @@ export const useSourcePlayerStore = create((set) => ({
   playerMethods: null,
   currentMediaId: null,
   currentSourceFrame: 0,
+  previewSourceFrame: null,
   inPoint: null,
   outPoint: null,
   pendingSeekFrame: null,
+  pendingPlay: false,
   setHoveredPanel: (panel) => set({ hoveredPanel: panel }),
   setPlayerMethods: (methods) => set({ playerMethods: methods }),
   setCurrentMediaId: (id) =>
     set((state) => {
       if (id === state.currentMediaId) return state;
-      return { currentMediaId: id, inPoint: null, outPoint: null, currentSourceFrame: 0 };
+      return {
+        currentMediaId: id,
+        inPoint: null,
+        outPoint: null,
+
currentSourceFrame: 0, + previewSourceFrame: null, + pendingSeekFrame: null, + pendingPlay: false, + }; }), releaseCurrentMediaId: (id) => set((state) => { if (state.currentMediaId !== id) return state; - return { currentMediaId: null, inPoint: null, outPoint: null, currentSourceFrame: 0 }; + return { + currentMediaId: null, + inPoint: null, + outPoint: null, + currentSourceFrame: 0, + previewSourceFrame: null, + pendingSeekFrame: null, + pendingPlay: false, + }; }), setCurrentSourceFrame: (frame) => set({ currentSourceFrame: frame }), + setPreviewSourceFrame: (frame) => set({ previewSourceFrame: frame }), setInPoint: (frame) => set((state) => { if (frame !== null && state.outPoint !== null && frame >= state.outPoint) { return { inPoint: frame, outPoint: null }; @@ -34,4 +53,5 @@ export const useSourcePlayerStore = create((set) => ({ }), clearInOutPoints: () => set({ inPoint: null, outPoint: null }), setPendingSeekFrame: (frame) => set({ pendingSeekFrame: frame }), + setPendingPlay: (play) => set({ pendingPlay: play }), })); diff --git a/src/shared/state/source-player/types.ts b/src/shared/state/source-player/types.ts index 47016c462..55283675a 100644 --- a/src/shared/state/source-player/types.ts +++ b/src/shared/state/source-player/types.ts @@ -1,5 +1,13 @@ export interface SourcePlayerMethods { toggle: () => void; + /** + * Unconditional pause — no-ops when already paused. Exposed so callers + * outside the source monitor (e.g. scene browser clicks) can stop the + * current scene synchronously before queueing a seek, instead of + * waiting for the seek-consume effect and racing with the video + * element still decoding the old frame. + */ + pause: () => void; seek: (frame: number) => void; frameBack: (frames: number) => void; frameForward: (frames: number) => void; @@ -11,16 +19,27 @@ export interface SourcePlayerState { playerMethods: SourcePlayerMethods | null; currentMediaId: string | null; currentSourceFrame: number; + previewSourceFrame: number | null; inPoint: number | null; outPoint: number | null; pendingSeekFrame: number | null; + /** + * When true, the source monitor starts playback after consuming the + * next `pendingSeekFrame`. The monitor always pauses before seeking, + * so scene-browser single-click just queues a seek (leaves paused) + * while double-click queues `pendingPlay: true` to play from the new + * scene. 
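+   *
+   * @example
+   * // Hypothetical scene-browser handler; `cut.frame` is illustrative.
+   * playerMethods?.pause();          // stop synchronously
+   * setPendingSeekFrame(cut.frame);  // queue the seek
+   * setPendingPlay(isDoubleClick);   // resume only on double-click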
+ */ + pendingPlay: boolean; setHoveredPanel: (panel: 'source' | null) => void; setPlayerMethods: (methods: SourcePlayerMethods | null) => void; setCurrentMediaId: (id: string | null) => void; releaseCurrentMediaId: (id: string) => void; setCurrentSourceFrame: (frame: number) => void; + setPreviewSourceFrame: (frame: number | null) => void; setInPoint: (frame: number | null) => void; setOutPoint: (frame: number | null) => void; clearInOutPoints: () => void; setPendingSeekFrame: (frame: number | null) => void; + setPendingPlay: (play: boolean) => void; } diff --git a/src/shared/utils/browser-whisper-models.ts b/src/shared/utils/browser-whisper-models.ts index 72b0fae63..3986fa882 100644 --- a/src/shared/utils/browser-whisper-models.ts +++ b/src/shared/utils/browser-whisper-models.ts @@ -1,6 +1,6 @@ import type { MediaTranscriptModel } from '@/types/storage'; -export const DEFAULT_BROWSER_WHISPER_MODEL: MediaTranscriptModel = 'whisper-tiny'; +export const DEFAULT_BROWSER_WHISPER_MODEL: MediaTranscriptModel = 'whisper-small'; export const BROWSER_WHISPER_MODEL_LABELS: Record = { 'whisper-tiny': 'Tiny', @@ -10,7 +10,6 @@ export const BROWSER_WHISPER_MODEL_LABELS: Record }; export const BROWSER_WHISPER_MODEL_OPTIONS = [ - { value: 'whisper-tiny', label: BROWSER_WHISPER_MODEL_LABELS['whisper-tiny'] }, { value: 'whisper-base', label: BROWSER_WHISPER_MODEL_LABELS['whisper-base'] }, { value: 'whisper-small', label: BROWSER_WHISPER_MODEL_LABELS['whisper-small'] }, { value: 'whisper-large', label: BROWSER_WHISPER_MODEL_LABELS['whisper-large'] }, @@ -18,3 +17,19 @@ export const BROWSER_WHISPER_MODEL_OPTIONS = [ value: MediaTranscriptModel; label: string; }>; + +const SELECTABLE_BROWSER_WHISPER_MODELS = new Set( + BROWSER_WHISPER_MODEL_OPTIONS.map((option) => option.value), +); + +export function normalizeSelectableBrowserWhisperModel( + model: MediaTranscriptModel | undefined, +): MediaTranscriptModel { + if (!model) { + return DEFAULT_BROWSER_WHISPER_MODEL; + } + + return SELECTABLE_BROWSER_WHISPER_MODELS.has(model) + ? 
model + : DEFAULT_BROWSER_WHISPER_MODEL; +} diff --git a/src/shared/utils/schedule-after-paint.ts b/src/shared/utils/schedule-after-paint.ts new file mode 100644 index 000000000..0f173fe2e --- /dev/null +++ b/src/shared/utils/schedule-after-paint.ts @@ -0,0 +1,18 @@ +export function scheduleAfterPaint(task: () => void): () => void { + if (typeof window === 'undefined') { + const timeoutId = setTimeout(task, 0); + return () => clearTimeout(timeoutId); + } + + let timeoutId: number | null = null; + const rafId = window.requestAnimationFrame(() => { + timeoutId = window.setTimeout(task, 0); + }); + + return () => { + window.cancelAnimationFrame(rafId); + if (timeoutId !== null) { + window.clearTimeout(timeoutId); + } + }; +} diff --git a/src/shared/utils/transcription-cancellation.ts b/src/shared/utils/transcription-cancellation.ts new file mode 100644 index 000000000..fe0c024ef --- /dev/null +++ b/src/shared/utils/transcription-cancellation.ts @@ -0,0 +1,44 @@ +import { LOCAL_INFERENCE_UNLOADED_MESSAGE } from '@/shared/state/local-inference'; + +export const TRANSCRIPTION_CANCELLED_MESSAGE = 'Transcription cancelled'; + +export function isTranscriptionCancellationError(error: unknown): boolean { + return error instanceof Error && ( + error.message === TRANSCRIPTION_CANCELLED_MESSAGE + || error.message === LOCAL_INFERENCE_UNLOADED_MESSAGE + ); +} + +const OOM_PATTERNS = [ + /out of memory/i, + /\boom\b/i, + /insufficient memory/i, + /allocation failed/i, + /failed to allocate/i, + /cannot allocate/i, + /memory allocation/i, + /array buffer allocation/i, + /device lost/i, + /webgpu.*buffer/i, + /createbuffer/i, + /wasm memory/i, + /maximum.*memory/i, +]; + +export function isTranscriptionOutOfMemoryError(error: unknown): boolean { + // RangeError from buffer allocation is the clearest OOM signal from browsers. + if (error instanceof RangeError) return true; + + if (!(error instanceof Error)) { + if (typeof error === 'string') { + return OOM_PATTERNS.some((pattern) => pattern.test(error)); + } + return false; + } + + const message = `${error.message} ${error.name}`; + return OOM_PATTERNS.some((pattern) => pattern.test(message)); +} + +export const TRANSCRIPTION_OOM_HINT = + 'The model ran out of memory. 
Try a lower quantization (q8 or q4) or a smaller model in Settings → Whisper, then try again.'; diff --git a/src/shared/utils/transcription-progress.test.ts b/src/shared/utils/transcription-progress.test.ts index ae6f0abc7..f7e16ed44 100644 --- a/src/shared/utils/transcription-progress.test.ts +++ b/src/shared/utils/transcription-progress.test.ts @@ -7,6 +7,7 @@ import { describe('transcription-progress', () => { it('maps stages into a stable overall percentage range', () => { + expect(getTranscriptionOverallPercent({ stage: 'queued', progress: 1 })).toBe(0); expect(getTranscriptionOverallPercent({ stage: 'loading', progress: 1 })).toBe(35); expect(getTranscriptionOverallPercent({ stage: 'decoding', progress: 0.5 })).toBeCloseTo(52.5); expect(getTranscriptionOverallPercent({ stage: 'transcribing', progress: 0.5 })).toBe(85); @@ -40,8 +41,9 @@ describe('transcription-progress', () => { }); it('formats readable stage labels', () => { + expect(getTranscriptionStageLabel('queued')).toBe('Queued'); expect(getTranscriptionStageLabel('loading')).toBe('Loading model'); - expect(getTranscriptionStageLabel('decoding')).toBe('Decoding audio'); + expect(getTranscriptionStageLabel('decoding')).toBe('Preparing audio'); expect(getTranscriptionStageLabel('transcribing')).toBe('Transcribing'); }); }); diff --git a/src/shared/utils/transcription-progress.ts b/src/shared/utils/transcription-progress.ts index 74f538eda..2e20350dc 100644 --- a/src/shared/utils/transcription-progress.ts +++ b/src/shared/utils/transcription-progress.ts @@ -1,4 +1,4 @@ -export type TranscriptionProgressStage = 'loading' | 'decoding' | 'transcribing'; +export type TranscriptionProgressStage = 'queued' | 'loading' | 'decoding' | 'transcribing'; export interface TranscriptionProgressSnapshot { stage: TranscriptionProgressStage; @@ -15,6 +15,8 @@ export function getTranscriptionOverallProgress( const normalizedProgress = clampProgress(snapshot.progress); switch (snapshot.stage) { + case 'queued': + return 0; case 'loading': return normalizedProgress * 0.35; case 'decoding': @@ -50,10 +52,12 @@ export function mergeTranscriptionProgress( export function getTranscriptionStageLabel(stage: TranscriptionProgressStage): string { switch (stage) { + case 'queued': + return 'Queued'; case 'loading': return 'Loading model'; case 'decoding': - return 'Decoding audio'; + return 'Preparing audio'; case 'transcribing': return 'Transcribing'; } diff --git a/src/shared/utils/whisper-settings.ts b/src/shared/utils/whisper-settings.ts index 8f86570e5..d8149f2ae 100644 --- a/src/shared/utils/whisper-settings.ts +++ b/src/shared/utils/whisper-settings.ts @@ -3,6 +3,7 @@ import { BROWSER_WHISPER_MODEL_LABELS, BROWSER_WHISPER_MODEL_OPTIONS, DEFAULT_BROWSER_WHISPER_MODEL, + normalizeSelectableBrowserWhisperModel, } from './browser-whisper-models'; export const DEFAULT_WHISPER_MODEL: MediaTranscriptModel = DEFAULT_BROWSER_WHISPER_MODEL; @@ -17,6 +18,12 @@ export const WHISPER_MODEL_OPTIONS: ReadonlyArray<{ label: string; }> = BROWSER_WHISPER_MODEL_OPTIONS; +export function normalizeSelectableWhisperModel( + model: MediaTranscriptModel | undefined, +): MediaTranscriptModel { + return normalizeSelectableBrowserWhisperModel(model); +} + export const WHISPER_QUANTIZATION_OPTIONS: ReadonlyArray<{ value: MediaTranscriptQuantization; label: string; @@ -51,9 +58,10 @@ export const WHISPER_QUANTIZATION_OPTIONS: ReadonlyArray<{ export function getWhisperQuantizationOption( value: MediaTranscriptQuantization | undefined, -) { +): (typeof 
WHISPER_QUANTIZATION_OPTIONS)[number] { + const fallback = WHISPER_QUANTIZATION_OPTIONS[0]!; return WHISPER_QUANTIZATION_OPTIONS.find((option) => option.value === value) - ?? WHISPER_QUANTIZATION_OPTIONS[0]; + ?? fallback; } const WHISPER_LANGUAGE_NAMES = { diff --git a/src/types/project.ts b/src/types/project.ts index 5a8bed1e0..d1b6bdc8b 100644 --- a/src/types/project.ts +++ b/src/types/project.ts @@ -15,7 +15,7 @@ export interface Project { * Increment CURRENT_SCHEMA_VERSION in lib/migrations when adding migrations. */ schemaVersion?: number; - thumbnailId?: string; // Reference to ThumbnailData in IndexedDB + thumbnailId?: string; // Reference to workspace-backed ThumbnailData thumbnail?: string; // @deprecated Base64 data URL (for backward compatibility) metadata: ProjectResolution; timeline?: ProjectTimeline; @@ -80,8 +80,9 @@ export interface ProjectTimeline { sourceDuration?: number; // Total duration of source media (frames) sourceFps?: number; // Source media frame rate for source* frame fields text?: string; + textRole?: 'caption'; captionSource?: { - type: 'transcript'; + type: 'transcript' | 'ai-captions'; clipId: string; mediaId: string; }; diff --git a/src/types/storage.ts b/src/types/storage.ts index 6b1a60923..77c9d32e9 100644 --- a/src/types/storage.ts +++ b/src/types/storage.ts @@ -54,7 +54,8 @@ export interface MediaMetadata { audioCodecSupported?: boolean; /** * Conformed preview-audio asset path for custom-decoded codecs. - * Points to a browser-native playable WAV stored in OPFS. + * Kept under the legacy name for compatibility, but now points to the + * workspace-backed persisted WAV path. */ previewAudioOpfsPath?: string; previewAudioMimeType?: string; @@ -74,8 +75,30 @@ export interface MediaMetadata { gopInterval?: number; thumbnailId?: string; tags: string[]; - /** AI-generated timestamped captions from LFM vision-language model. */ - aiCaptions?: Array<{ timeSec: number; text: string }>; + /** + * AI-generated timestamped captions from LFM vision-language model. + * Mirrors the canonical `cache/ai/captions.json` payload for in-memory + * consumers (search, Scene Browser). See `MediaCaption` in + * `lib/analysis/captioning/types.ts` for the full shape including optional + * thumbnail paths, semantic embeddings, and color palettes. + */ + aiCaptions?: Array<{ + timeSec: number; + text: string; + sceneData?: { + caption?: string; + shotType?: string; + subjects?: string[]; + action?: string; + setting?: string; + lighting?: string; + timeOfDay?: string; + weather?: string; + }; + thumbRelPath?: string; + embedding?: number[]; + palette?: Array<{ l: number; a: number; b: number; weight: number }>; + }>; createdAt: number; updatedAt: number; } @@ -157,7 +180,7 @@ export interface WaveformData { createdAt: number; } -// Streaming waveform cache records (meta + bins in IndexedDB). +// Streaming waveform cache records (meta + bins in persisted storage). export interface WaveformMeta { id: string; // Same as mediaId mediaId: string; diff --git a/src/types/timeline.ts b/src/types/timeline.ts index 737438a20..410fc3c17 100644 --- a/src/types/timeline.ts +++ b/src/types/timeline.ts @@ -94,7 +94,13 @@ type BaseTimelineItem = { }; export interface GeneratedCaptionSource { - type: 'transcript'; + /** + * `transcript` — generated from whisper speech-to-text segments. + * `ai-captions` — generated from vision-language-model frame descriptions + * (e.g. LFM captioning). Distinguished so replace/remove flows can target + * one kind without disturbing the other on the same clip. 
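+   *
+   * For example, caption items generated from whisper segments carry
+   * `{ type: 'transcript', clipId, mediaId }`, while items generated from
+   * LFM frame descriptions carry `{ type: 'ai-captions', clipId, mediaId }`
+   * for the same clip, so either set can be replaced or removed
+   * independently.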
+ */ + type: 'transcript' | 'ai-captions'; clipId: string; mediaId: string; } @@ -122,6 +128,7 @@ export type AudioItem = BaseTimelineItem & { export type TextItem = BaseTimelineItem & { type: 'text'; text: string; + textRole?: 'caption'; captionSource?: GeneratedCaptionSource; // Typography fontSize?: number; // Font size in pixels (default: 60) diff --git a/vercel.json b/vercel.json index ae401caa2..61635dfae 100644 --- a/vercel.json +++ b/vercel.json @@ -37,7 +37,7 @@ }, { "key": "Cross-Origin-Embedder-Policy", - "value": "require-corp" + "value": "credentialless" }, { "key": "Cross-Origin-Opener-Policy",