diff --git a/README.md b/README.md
index d3a12ba4c..8fb279b8b 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 
 ![FreeCut Timeline Editor](./public/assets/landing/timeline.png)
 
-FreeCut is a browser-based multi-track video editor. No installation, no uploads — everything runs locally in your browser using WebGPU, WebCodecs, OPFS, and the File System Access API.
+FreeCut is a browser-based multi-track video editor. No installation, no uploads — everything runs locally in your browser using WebGPU, WebCodecs, and the File System Access API. Projects, media metadata, thumbnails, waveforms, and transcripts are written as plain files to a workspace folder you pick on disk.
 
 ## Features
 
@@ -45,8 +45,10 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 
 ### Transitions
 
-- **CPU transitions** — fade, wipe, slide, 3D flip, clock wipe, iris — each with directional variants
-- **GPU transitions** — dissolve, sparkles, glitch, light leak, pixelate, chromatic aberration, radial blur
+All transitions are WebGPU-accelerated with a Canvas 2D fallback for non-WebGPU environments.
+
+- Fade, wipe, slide, 3D flip, clock wipe, iris — each with directional variants
+- Dissolve, sparkles, glitch, light leak, pixelate, chromatic aberration, radial blur
 - Adjustable duration and alignment
 
 ### Keyframe Animation
 
@@ -78,7 +80,7 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 - **Audio:** MP3, WAV, AAC, OGG, Opus
 - **Image:** JPG, PNG, GIF (animated), WebP
 - Up to 5 GB per file
-- OPFS proxy video generation for smooth preview
+- Proxy video generation for smooth preview (cached to the workspace folder)
 - Media relinking for moved or deleted files
 - Scene detection and optical flow analysis
 
@@ -89,19 +91,24 @@ Layer masks with keyframeable geometry transforms for compositing and selective
 - Auto-generate caption text items from transcripts
 - Multi-language support
 
+### Text-to-Speech
+
+- In-browser voiceover generation via KittenTTS (WebGPU)
+- Adds the generated audio clip directly to the timeline
+
 ### Other
 
 - Native SVG shapes — rectangle, circle, triangle, ellipse, star, polygon, heart
 - Text overlays with custom fonts, colors, and positioning
 - Project bundles — export/import projects as ZIP files with Zod-validated schemas
-- IndexedDB persistence with content-addressable storage
+- Workspace folder persistence via the File System Access API — your projects live as plain files on disk, not locked away in browser storage
 - Auto-save
 - Customizable keyboard shortcuts with preset import/export
 - Configurable settings (default FPS, snap, waveforms, filmstrips, preview quality, export defaults, undo depth, auto-save interval)
 
 ## Quick Start
 
-**Prerequisites:** Node.js 18+
+**Prerequisites:** Node.js 20+
 
 ```bash
 git clone https://github.com/walterlow/freecut.git
@@ -114,12 +121,13 @@
 Open [http://localhost:5173](http://localhost:5173) in Chrome.
 
 ### Workflow
 
-1. Create a project from the projects page
-2. Import media by dragging files into the media library
-3. Drag clips to the timeline — trim, arrange, add effects and transitions
-4. Animate with the keyframe editor
-5. Preview your edit in real time
-6. Export directly from the browser
+1. Pick a workspace folder when prompted — FreeCut writes all projects, media metadata, and caches into this folder
+2. Create a project from the projects page
+3. Import media by dragging files into the media library
+4. Drag clips to the timeline — trim, arrange, add effects and transitions
+5. Animate with the keyframe editor
+6. Preview your edit in real time
+7. Export directly from the browser
 
 ## Browser Support
 
@@ -184,7 +192,9 @@ Brave disables the File System Access API by default. To enable it:
 - [Tailwind CSS 4](https://tailwindcss.com/) + [shadcn/ui](https://ui.shadcn.com/) — styling and UI components
 - [Mediabunny](https://mediabunny.dev/) — media decoding and metadata extraction
 - [WebCodecs](https://developer.mozilla.org/en-US/docs/Web/API/WebCodecs_API) — composition rendering and export
-- [OPFS](https://developer.mozilla.org/en-US/docs/Web/API/File_System_API/Origin_private_file_system) + [IndexedDB](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API) — local persistence
+- [File System Access API](https://developer.mozilla.org/en-US/docs/Web/API/File_System_API) — workspace folder persistence
+- [Transformers.js](https://huggingface.co/docs/transformers.js) — in-browser Whisper transcription
+- [KittenTTS](https://github.com/KittenML/kitten-tts-webgpu) — WebGPU text-to-speech
 - Web Workers — heavy processing off the main thread
 
 ## Development
 
@@ -235,7 +245,7 @@ src/
 | |- animation/ # Easing functions and interpolation
 | |- projects/ # Project domain types
 | \- timeline/ # Transitions (engine, registry, renderers)
-|- infrastructure/ # Browser/storage/GPU adapters
+|- infrastructure/ # Browser/storage/GPU adapters (workspace-fs, handles-db, gpu facades)
 |- lib/
 | |- gpu-effects/ # WebGPU effect pipeline + shader definitions
 | |- gpu-transitions/ # WebGPU transition pipeline + shaders
@@ -256,10 +266,11 @@ src/
 | |- export/ # WebCodecs export pipeline (Web Worker)
 | |- effects/ # GPU effect system and UI panels
 | |- keyframes/ # Keyframe animation, Bezier editor, easing
-| |- media-library/ # Media import, metadata, OPFS proxies, transcription
+| |- media-library/ # Media import, metadata, proxy cache, transcription, TTS
 | |- project-bundle/ # Project ZIP export/import
 | |- projects/ # Project management
-| \- settings/ # App settings, keyboard shortcut editor
+| |- settings/ # App settings, keyboard shortcut editor
+| \- workspace-gate/ # Workspace folder picker / permission gate
 |- shared/ # Shared UI/state/utilities across layers
 |- components/ui/ # shadcn/ui components
 |- config/hotkeys.ts # Keyboard shortcut definitions
diff --git a/src/app/state/editor/store.ts b/src/app/state/editor/store.ts
index 15949b3d9..7b1a35125 100644
--- a/src/app/state/editor/store.ts
+++ b/src/app/state/editor/store.ts
@@ -51,6 +51,7 @@ export const useEditorStore = create((set) => ({
   mediaSkimPreviewFrame: null,
   compoundClipSkimPreviewCompositionId: null,
   compoundClipSkimPreviewFrame: null,
+  transcriptionDialogDepth: 0,
   sourcePatchVideoEnabled: true,
   sourcePatchAudioEnabled: true,
   sourcePatchVideoTrackId: null,
@@ -179,6 +180,12 @@ export const useEditorStore = create((set) => ({
       compoundClipSkimPreviewFrame: null,
     };
   }),
+  beginTranscriptionDialog: () => set((state) => ({
+    transcriptionDialogDepth: state.transcriptionDialogDepth + 1,
+  })),
+  endTranscriptionDialog: () => set((state) => ({
+    transcriptionDialogDepth: Math.max(0, state.transcriptionDialogDepth - 1),
+  })),
   setSourcePatchVideoEnabled: (enabled) => set({ sourcePatchVideoEnabled: enabled }),
   setSourcePatchAudioEnabled: (enabled) => set({ sourcePatchAudioEnabled: enabled }),
   setSourcePatchVideoTrackId: (trackId) => set({ sourcePatchVideoTrackId: trackId }),
diff --git a/src/app/state/editor/types.ts b/src/app/state/editor/types.ts
index b3d42f298..3705494c1 100644
--- a/src/app/state/editor/types.ts
+++ b/src/app/state/editor/types.ts
@@ -16,6 +16,7 @@ export interface EditorState {
   mediaSkimPreviewFrame: number | null;
   compoundClipSkimPreviewCompositionId: string | null;
   compoundClipSkimPreviewFrame: number | null;
+  transcriptionDialogDepth: number;
   sourcePatchVideoEnabled: boolean;
   sourcePatchAudioEnabled: boolean;
   sourcePatchVideoTrackId: string | null;
@@ -54,6 +55,8 @@ export interface EditorActions {
   clearMediaSkimPreview: () => void;
   setCompoundClipSkimPreview: (compositionId: string | null, frame?: number | null) => void;
   clearCompoundClipSkimPreview: () => void;
+  beginTranscriptionDialog: () => void;
+  endTranscriptionDialog: () => void;
   setSourcePatchVideoEnabled: (enabled: boolean) => void;
   setSourcePatchAudioEnabled: (enabled: boolean) => void;
   setSourcePatchVideoTrackId: (trackId: string | null) => void;
diff --git a/src/config/hotkeys.ts b/src/config/hotkeys.ts
index aadf3f79a..5abfec2e8 100644
--- a/src/config/hotkeys.ts
+++ b/src/config/hotkeys.ts
@@ -67,6 +67,7 @@ export const HOTKEYS = {
   // UI
   TOGGLE_SNAP: 's',
+  OPEN_SCENE_BROWSER: 'mod+shift+f',
 
   // Markers
   ADD_MARKER: 'm',
@@ -325,6 +326,7 @@ export const HOTKEY_DESCRIPTIONS: Record = {
   // UI
   TOGGLE_SNAP: 'Toggle snap',
+  OPEN_SCENE_BROWSER: 'Open Scene Browser (search AI captions)',
 
   // Markers
   ADD_MARKER: 'Add marker at playhead',
diff --git a/src/features/composition-runtime/utils/audio-decode-cache.ts b/src/features/composition-runtime/utils/audio-decode-cache.ts
index 5b423082f..fc9a9c281 100644
--- a/src/features/composition-runtime/utils/audio-decode-cache.ts
+++ b/src/features/composition-runtime/utils/audio-decode-cache.ts
@@ -4,11 +4,11 @@
  * Caches decoded AudioBuffers for custom-decoded audio tracks so that
  * split clips from the same source share a single decode.
  *
- * Storage: Decoded audio is persisted to IndexedDB in 10-second bins
+ * Storage: Decoded audio is persisted to workspace-backed files in 10-second bins
  * (Int16 @ 22050 Hz stereo ~ 0.84 MB/bin). This avoids large single
  * records and allows progressive persistence during decode.
  *
- * On refresh, bins are loaded from IndexedDB in parallel and
+ * On refresh, bins are loaded from the workspace cache in parallel and
  * reassembled into an AudioBuffer with no re-decode needed.
  *
  * Surround (5.1/7.1) sources are downmixed to stereo during decode
@@ -72,10 +72,10 @@ const PLAYABLE_PARTIAL_PREROLL_SECONDS = 0.25;
 const STARTUP_PLAYABLE_PARTIAL_READY_SECONDS = 1;
 const PENDING_PLAYBACK_SLICE_REUSE_HEADROOM_SECONDS = 1;
 
-/** Sample rate for IndexedDB storage; 22050 Hz is sufficient for preview. */
+/** Sample rate for persisted preview-audio bins; 22050 Hz is sufficient for preview. */
 const STORAGE_SAMPLE_RATE = 22050;
 
-/** Bin duration in seconds for chunked IndexedDB storage. */
+/** Bin duration in seconds for chunked persisted storage. */
 const BIN_DURATION_SEC = 10;
 
 export interface PlaybackAudioSlice {
@@ -224,7 +224,7 @@ function createInputSource(
 /**
  * Get a cached AudioBuffer or decode one via mediabunny.
- * Checks: memory cache -> IndexedDB bins -> decode (persists bins progressively).
+ * Checks: memory cache -> persisted bins -> decode (persists bins progressively).
  * Concurrent calls for the same mediaId share a single promise.
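+ *
+ * @example
+ * // Illustrative sketch (hypothetical usage): split clips that reference the
+ * // same source media share one in-flight decode.
+ * const [left, right] = await Promise.all([
+ *   ensureDecodeStarted(mediaId, source),
+ *   ensureDecodeStarted(mediaId, source),
+ * ]);
+ * // left === right: one decode, one set of persisted bins, shared by both clips.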
*/ function ensureDecodeStarted(mediaId: string, src: PreviewAudioSource): Promise { @@ -619,11 +619,11 @@ export function clearPreviewAudioCache(): void { } // --------------------------------------------------------------------------- -// Load from IndexedDB bins +// Load from persisted bins // --------------------------------------------------------------------------- async function loadOrDecodeAudio(mediaId: string, src: PreviewAudioSource): Promise { - // Try IndexedDB + // Try persisted workspace cache try { const cached = await getDecodedPreviewAudio(mediaId); if (cached && 'kind' in cached && cached.kind === 'meta') { @@ -638,7 +638,7 @@ async function loadOrDecodeAudio(mediaId: string, src: PreviewAudioSource): Prom await deleteDecodedPreviewAudio(mediaId).catch(() => undefined); } } catch (err) { - log.warn('Failed to load from IndexedDB, will decode', { mediaId, err }); + log.warn('Failed to load persisted decoded audio, will decode', { mediaId, err }); } // Full decode with progressive bin persistence @@ -694,7 +694,7 @@ async function loadFromBins(meta: DecodedPreviewAudioMeta): Promise throw new Error(`Decoded audio bins incomplete: ${offset}/${totalFrames} frames`); } - log.info('Loaded decoded audio from IndexedDB', { + log.info('Loaded decoded audio from workspace cache', { mediaId, binCount, sampleRate, @@ -804,7 +804,7 @@ async function buildPreviewStereoBuffer( } /** - * Downsample, convert to Int16, and persist one bin to IndexedDB. + * Downsample, convert to Int16, and persist one bin to workspace-backed storage. * Returns persisted Int16 data so playback can be assembled without * retaining a massive full-resolution decode in memory. */ @@ -1015,9 +1015,9 @@ async function decodeFullAudio( binDurationSec: BIN_DURATION_SEC, createdAt: Date.now(), }).then(() => { - log.info('All bins persisted to IndexedDB', { mediaId, binCount: totalBins }); + log.info('All bins persisted to workspace cache', { mediaId, binCount: totalBins }); }).catch((err) => { - log.warn('Failed to persist bins to IndexedDB', { mediaId, err }); + log.warn('Failed to persist bins to workspace cache', { mediaId, err }); }); return combined; diff --git a/src/features/composition-runtime/utils/preview-audio-conform.ts b/src/features/composition-runtime/utils/preview-audio-conform.ts index 62b5df8f1..4d5af80eb 100644 --- a/src/features/composition-runtime/utils/preview-audio-conform.ts +++ b/src/features/composition-runtime/utils/preview-audio-conform.ts @@ -4,10 +4,12 @@ import { opfsService } from '@/features/composition-runtime/deps/media-library'; import { createLogger } from '@/shared/logging/logger'; import type { MediaMetadata } from '@/types/storage'; import { - mirrorBytesToWorkspace, readWorkspaceBlob, removeWorkspaceCacheEntry, } from '@/infrastructure/storage/workspace-fs/cache-mirror'; +import { previewAudioPath } from '@/infrastructure/storage/workspace-fs/paths'; +import { requireWorkspaceRoot } from '@/infrastructure/storage/workspace-fs/root'; +import { writeBlob } from '@/infrastructure/storage/workspace-fs/fs-primitives'; import { audioBufferToWavBlob } from './audio-buffer-wav'; const log = createLogger('PreviewAudioConform'); @@ -18,7 +20,7 @@ const PREVIEW_AUDIO_CONFORM_MIME_TYPE = 'audio/wav'; const pendingPreviewAudioConformLoads = new Map>(); const pendingPreviewAudioConformPersists = new Map>(); -function buildPreviewAudioConformOpfsPath(mediaId: string): string { +function buildPreviewAudioConformPath(mediaId: string): string { const shard1 = mediaId.slice(0, 2) || '00'; const 
shard2 = mediaId.slice(2, 4) || '00'; return `${PREVIEW_AUDIO_CONFORM_DIR}/${shard1}/${shard2}/${mediaId}.wav`; @@ -50,33 +52,38 @@ export async function resolvePreviewAudioConformUrl(mediaId: string): Promise undefined); - throw err; - } - - // Mirror to the workspace folder so other origins can reuse the - // conformed WAV without re-running the decode/encode. Fire-and-forget. - void mirrorBytesToWorkspace(opfsPath.split('/'), bytes); + await writeBlob( + requireWorkspaceRoot(), + previewAudioPath(persistedPath), + new Uint8Array(bytes), + ); + + await updateMedia(mediaId, { + previewAudioOpfsPath: persistedPath, + previewAudioMimeType: PREVIEW_AUDIO_CONFORM_MIME_TYPE, + previewAudioConformedAt: Date.now(), + }); })() .catch((err) => { log.warn('Failed to persist preview audio conform asset', { mediaId, err }); @@ -169,16 +161,17 @@ export async function deletePreviewAudioConform( } if (media.previewAudioOpfsPath) { + const persistedPath = media.previewAudioOpfsPath; try { - await opfsService.deleteFile(media.previewAudioOpfsPath); + await opfsService.deleteFile(persistedPath); } catch (err) { - log.warn('Failed to delete preview audio conform asset', { + log.debug('Legacy OPFS preview audio conform asset was already absent or unreadable', { mediaId, - path: media.previewAudioOpfsPath, + path: persistedPath, err, }); } - void removeWorkspaceCacheEntry(media.previewAudioOpfsPath.split('/')); + void removeWorkspaceCacheEntry(previewAudioPath(persistedPath)); } if (options?.clearMetadata) { diff --git a/src/features/editor/components/audio-meter-panel.tsx b/src/features/editor/components/audio-meter-panel.tsx index 35f14375a..175e72d6f 100644 --- a/src/features/editor/components/audio-meter-panel.tsx +++ b/src/features/editor/components/audio-meter-panel.tsx @@ -8,7 +8,7 @@ import { importWaveformCache, } from '@/features/editor/deps/timeline-store'; import { importMediaLibraryService } from '@/features/editor/deps/media-library'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { useEditorStore } from '@/app/state/editor/store'; import { EDITOR_LAYOUT_CSS_VALUES } from '@/app/editor-layout'; @@ -174,7 +174,14 @@ export const AudioMeterPanel = memo(function AudioMeterPanel() { lastTimestamp: 0, }); - const effectiveFrame = previewFrame ?? displayedFrame ?? 
currentFrame; + const effectiveFrame = useMemo(() => getResolvedPlaybackFrame({ + currentFrame, + currentFrameEpoch: usePlaybackStore.getState().currentFrameEpoch, + previewFrame, + previewFrameEpoch: usePlaybackStore.getState().previewFrameEpoch, + isPlaying, + displayedFrame, + }), [currentFrame, displayedFrame, isPlaying, previewFrame]); const combinedTracks = useMemo(() => { return tracks .filter((track) => !track.isGroup) diff --git a/src/features/editor/components/editor.test.tsx b/src/features/editor/components/editor.test.tsx index 0e33600fc..717313fcf 100644 --- a/src/features/editor/components/editor.test.tsx +++ b/src/features/editor/components/editor.test.tsx @@ -24,6 +24,7 @@ const mocks = vi.hoisted(() => ({ }), initTransitionChainSubscription: vi.fn(() => vi.fn()), createProjectUpgradeBackup: vi.fn(), + resizablePanelGroup: vi.fn(), })); vi.mock('@tanstack/react-router', () => ({ @@ -42,7 +43,18 @@ vi.mock('@/shared/logging/logger', () => ({ })); vi.mock('@/components/ui/resizable', () => ({ - ResizablePanelGroup: ({ children }: { children: ReactNode }) =>
{children}
, + ResizablePanelGroup: ({ + children, + ...props + }: { + children: ReactNode; + autoSaveId?: string; + className?: string; + direction?: string; + }) => { + mocks.resizablePanelGroup(props); + return
{children}
; + }, ResizablePanel: ({ children }: { children: ReactNode }) =>
{children}
, ResizableHandle: () =>
, })); @@ -326,4 +338,31 @@ describe('LoadedEditor migration metadata refresh', () => { await waitFor(() => expect(mocks.invalidate).not.toHaveBeenCalled()); }); + + it('persists the timeline split layout in localStorage', async () => { + render( + + ); + + expect(mocks.resizablePanelGroup).toHaveBeenCalledWith( + expect.objectContaining({ + autoSaveId: 'editor:timeline-layout', + direction: 'vertical', + }) + ); + }); }); diff --git a/src/features/editor/components/editor.tsx b/src/features/editor/components/editor.tsx index 4a5c28942..e4aeab202 100644 --- a/src/features/editor/components/editor.tsx +++ b/src/features/editor/components/editor.tsx @@ -461,7 +461,11 @@ export const LoadedEditor = memo(function LoadedEditor({ )} {/* Right side: Preview/Properties + Timeline */} - + {/* Top - Preview + Properties (inline mode) */} { ); expect(screen.getByTestId('inline-source-preview')).toBeInTheDocument(); - expect(screen.queryByTestId('video-preview')).not.toBeInTheDocument(); + expect(screen.getByTestId('video-preview')).toBeInTheDocument(); expect(screen.getByTestId('playback-controls')).toBeInTheDocument(); }); @@ -176,7 +176,7 @@ describe('PreviewArea mask editor toolbar', () => { ); expect(screen.getByTestId('inline-composition-preview')).toBeInTheDocument(); - expect(screen.queryByTestId('video-preview')).not.toBeInTheDocument(); + expect(screen.getByTestId('video-preview')).toBeInTheDocument(); expect(screen.getByTestId('playback-controls')).toBeInTheDocument(); }); diff --git a/src/features/editor/components/preview-area.tsx b/src/features/editor/components/preview-area.tsx index 687a22b81..5b990d228 100644 --- a/src/features/editor/components/preview-area.tsx +++ b/src/features/editor/components/preview-area.tsx @@ -92,28 +92,34 @@ const ProgramPreviewSurface = memo(function ProgramPreviewSurface({ const mediaSkimPreviewFrame = useEditorStore((s) => s.mediaSkimPreviewFrame); const compoundClipSkimPreviewCompositionId = useEditorStore((s) => s.compoundClipSkimPreviewCompositionId); const compoundClipSkimPreviewFrame = useEditorStore((s) => s.compoundClipSkimPreviewFrame); + const skimPreviewOverlay = compoundClipSkimPreviewCompositionId ? ( + + ) : mediaSkimPreviewMediaId ? ( + + ) : null; return ( - {compoundClipSkimPreviewCompositionId ? ( - - ) : mediaSkimPreviewMediaId ? ( - - ) : ( +
- )} + {skimPreviewOverlay && ( +
+ {skimPreviewOverlay} +
+ )} +
); }); diff --git a/src/features/editor/components/settings-dialog.tsx b/src/features/editor/components/settings-dialog.tsx index 2e2dc4205..82edbef64 100644 --- a/src/features/editor/components/settings-dialog.tsx +++ b/src/features/editor/components/settings-dialog.tsx @@ -1,5 +1,6 @@ import { useState, useCallback } from 'react'; import type { MediaMetadata } from '@/types/storage'; +import { toast } from 'sonner'; import { Dialog, DialogContent, @@ -17,7 +18,6 @@ import { AlertDialogTitle, } from '@/components/ui/alert-dialog'; import { Button } from '@/components/ui/button'; -import { Combobox } from '@/components/ui/combobox'; import { Label } from '@/components/ui/label'; import { Separator } from '@/components/ui/separator'; import { @@ -30,19 +30,23 @@ import { import { Switch } from '@/components/ui/switch'; import { Slider } from '@/components/ui/slider'; import { ScrollArea } from '@/components/ui/scroll-area'; +import { Input } from '@/components/ui/input'; import { - RotateCcw, Trash2, Loader2, Check, ImagePlus, Film, - Settings2, Rows3, AudioLines, HardDrive, + RotateCcw, Trash2, Loader2, Check, ImagePlus, Film, TriangleAlert, + Settings2, Rows3, HardDrive, Sparkles, } from 'lucide-react'; import { LocalInferenceUnloadControl, LocalModelCacheControl, useSettingsStore, + CAPTIONING_INTERVAL_BOUNDS, + DEFAULT_CAPTIONING_INTERVAL_SECONDS, + resolveCaptioningIntervalSec, + type CaptioningIntervalUnit, } from '@/features/editor/deps/settings'; import { useMediaLibraryStore, getSharedProxyKey, - getMediaTranscriptionModelOptions, importProxyService, importMediaLibraryService, importThumbnailGenerator, @@ -56,25 +60,28 @@ import { clearPreviewAudioCache } from '@/features/editor/deps/composition-runti import { createLogger } from '@/shared/logging/logger'; import { cn } from '@/shared/ui/cn'; import { EDITOR_DENSITY_OPTIONS } from '@/app/editor-layout'; -import { - getWhisperQuantizationOption, - getWhisperLanguageSelectValue, - getWhisperLanguageSettingValue, - WHISPER_LANGUAGE_OPTIONS, - WHISPER_QUANTIZATION_OPTIONS, -} from '@/shared/utils/whisper-settings'; -import type { MediaTranscriptModel, MediaTranscriptQuantization } from '@/types/storage'; const log = createLogger('SettingsDialog'); -const TRANSCRIPTION_MODEL_OPTIONS = getMediaTranscriptionModelOptions(); const SETTINGS_SECTIONS = [ { id: 'general', label: 'General', icon: Settings2 }, { id: 'timeline', label: 'Timeline', icon: Rows3 }, - { id: 'whisper', label: 'Whisper', icon: AudioLines }, + { id: 'ai', label: 'AI', icon: Sparkles }, { id: 'storage', label: 'Storage', icon: HardDrive }, ] as const; +const ESTIMATE_REFERENCE_DURATION_SEC = 60; +const ESTIMATE_REFERENCE_FPS = 30; + +function formatCaptionEstimate(unit: CaptioningIntervalUnit, value: number): string { + const intervalSec = resolveCaptioningIntervalSec(unit, value, ESTIMATE_REFERENCE_FPS); + if (intervalSec <= 0) { + return 'Enter an interval above zero.'; + } + const sceneCount = Math.max(1, Math.round(ESTIMATE_REFERENCE_DURATION_SEC / intervalSec)); + return `~${sceneCount} ${sceneCount === 1 ? 
'scene' : 'scenes'} per 1-min clip at ${ESTIMATE_REFERENCE_FPS}fps`; +} + type SettingsSectionId = (typeof SETTINGS_SECTIONS)[number]['id']; interface SettingsDialogProps { @@ -82,6 +89,99 @@ interface SettingsDialogProps { onOpenChange: (open: boolean) => void; } +interface BatchActionResult { + total: number; + succeeded: number; + failed: number; + failedItems: string[]; +} + +interface ActionFeedback { + tone: 'success' | 'error'; + message: string; +} + +function formatCount(count: number, noun: string): string { + return `${count} ${noun}${count === 1 ? '' : 's'}`; +} + +function formatFailedItems(items: string[]): string { + if (items.length === 0) return ''; + if (items.length <= 2) return items.join(', '); + return `${items.slice(0, 2).join(', ')}, +${items.length - 2} more`; +} + +function createBatchResult(total: number, failedItems: string[]): BatchActionResult { + return { + total, + succeeded: Math.max(0, total - failedItems.length), + failed: failedItems.length, + failedItems, + }; +} + +function getBatchOutcomeFeedback( + actionLabel: string, + result: BatchActionResult, +): ActionFeedback { + if (result.total === 0) { + return { + tone: 'success', + message: `No project media to ${actionLabel.toLowerCase()}.`, + }; + } + + if (result.failed === 0) { + return { + tone: 'success', + message: `${actionLabel} completed for ${formatCount(result.succeeded, 'item')}.`, + }; + } + + const failedLabel = formatFailedItems(result.failedItems); + + if (result.succeeded === 0) { + return { + tone: 'error', + message: `Couldn't ${actionLabel.toLowerCase()} ${formatCount(result.failed, 'item')}${failedLabel ? `: ${failedLabel}` : '.'}`, + }; + } + + return { + tone: 'error', + message: `${actionLabel} completed for ${result.succeeded}/${result.total} items. Needs attention: ${failedLabel}.`, + }; +} + +function showBatchOutcomeToast( + successTitle: string, + partialTitle: string, + failureTitle: string, + result: BatchActionResult, +): void { + if (result.total === 0) { + toast.success(successTitle, { + description: 'No project media needed updating.', + }); + return; + } + + if (result.failed === 0) { + toast.success(successTitle, { + description: `${formatCount(result.succeeded, 'item')} updated.`, + }); + return; + } + + const description = result.succeeded === 0 + ? formatFailedItems(result.failedItems) + : `${formatCount(result.succeeded, 'item')} updated. Failed: ${formatFailedItems(result.failedItems)}`; + + toast.error(result.succeeded === 0 ? failureTitle : partialTitle, { + description, + }); +} + /** * Clear regenerable cache data for the current project's media only. * Clears filmstrips, waveforms, GIF frames, and decoded audio @@ -89,8 +189,10 @@ interface SettingsDialogProps { * * Does NOT clear thumbnails (not auto-regenerated) or proxies (separate action). 
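 *
 * @example
 * // Illustrative sketch mirroring the handleClearCache call site below:
 * const result = await clearProjectCaches(
 *   mediaItems.map((m) => ({ id: m.id, fileName: m.fileName })),
 * );
 * // result is a BatchActionResult ({ total, succeeded, failed, failedItems })
 * // that feeds getBatchOutcomeFeedback() and showBatchOutcomeToast().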
*/ -async function clearProjectCaches(mediaIds: string[]): Promise { - if (mediaIds.length === 0) return; +async function clearProjectCaches( + mediaItems: Array>, +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const [ { deleteWaveform, deleteGifFrames, deleteDecodedPreviewAudio }, @@ -109,49 +211,73 @@ async function clearProjectCaches(mediaIds: string[]): Promise { // Clear in-memory preview audio cache (not keyed per-media, so clear all) clearPreviewAudioCache(); - await Promise.all( - mediaIds.flatMap((id) => [ - deleteWaveform(id).catch((e) => { log.debug('Failed to delete waveform:', id, e); }), - deleteGifFrames(id).catch((e) => { log.debug('Failed to delete GIF frames:', id, e); }), - deleteDecodedPreviewAudio(id).catch((e) => { log.debug('Failed to delete decoded audio:', id, e); }), - deletePreviewAudioConform(id, { clearMetadata: true }).catch((e) => { log.debug('Failed to delete preview conform audio:', id, e); }), - gifFrameCache.clearMedia(id).catch((e) => { log.debug('Failed to clear GIF cache:', id, e); }), - filmstripCache.clearMedia(id).catch((e) => { log.debug('Failed to clear filmstrip cache:', id, e); }), - waveformCache.clearMedia(id).catch((e) => { log.debug('Failed to clear waveform cache:', id, e); }), - ]) - ); + const failedItems: string[] = []; + + await Promise.all(mediaItems.map(async ({ id, fileName }) => { + const results = await Promise.allSettled([ + deleteWaveform(id), + deleteGifFrames(id), + deleteDecodedPreviewAudio(id), + deletePreviewAudioConform(id, { clearMetadata: true }), + gifFrameCache.clearMedia(id), + filmstripCache.clearMedia(id), + waveformCache.clearMedia(id), + ]); + + const failures = results.filter((result) => result.status === 'rejected'); + if (failures.length > 0) { + log.warn('Failed to fully clear project cache for media item', { + mediaId: id, + fileName, + failures: failures.map((result) => String(result.reason)), + }); + failedItems.push(fileName); + } + })); - log.info(`Cleared caches for ${mediaIds.length} media items`); + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Cleared caches for ${result.succeeded}/${result.total} media items`); + return result; } /** Delete all proxy videos for the given media items and clear their store status. */ async function clearProjectProxies( mediaItems: MediaMetadata[] -): Promise { - if (mediaItems.length === 0) return; +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const { proxyService } = await importProxyService(); + const failedItems: string[] = []; await Promise.all(mediaItems.map(async (media) => { try { await proxyService.deleteProxy(media.id, getSharedProxyKey(media)); - } catch { /* already absent */ } - useMediaLibraryStore.getState().clearProxyStatus(media.id); - proxyService.clearProxyKey(media.id); + useMediaLibraryStore.getState().clearProxyStatus(media.id); + proxyService.clearProxyKey(media.id); + } catch (error) { + log.warn('Failed to clear proxy for media item', { + mediaId: media.id, + fileName: media.fileName, + error, + }); + failedItems.push(media.fileName); + } })); - log.info(`Cleared proxies for ${mediaItems.length} media items`); + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Cleared proxies for ${result.succeeded}/${result.total} media items`); + return result; } /** * Regenerate thumbnails for all media in the current project. - * Fetches each media file, generates a new thumbnail, and saves it to IndexedDB. 
+ * Fetches each media file, generates a new thumbnail, and saves it to workspace storage. */ async function regenerateProjectThumbnails( mediaItems: Array<{ id: string; fileName: string; mimeType: string }>, onProgress?: (done: number, total: number) => void, -): Promise { - if (mediaItems.length === 0) return 0; +): Promise { + if (mediaItems.length === 0) return createBatchResult(0, []); const [ { mediaLibraryService }, @@ -163,7 +289,8 @@ async function regenerateProjectThumbnails( import('@/infrastructure/storage'), ]); - let regenerated = 0; + let succeeded = 0; + const failedItems: string[] = []; for (const media of mediaItems) { try { @@ -189,18 +316,20 @@ async function regenerateProjectThumbnails( // Clear the in-memory blob URL cache so UI picks up the new thumbnail mediaLibraryService.clearThumbnailCache(media.id); - regenerated++; + succeeded++; } catch (err) { log.warn(`Failed to regenerate thumbnail for ${media.fileName}:`, err); + failedItems.push(media.fileName); } - onProgress?.(regenerated, mediaItems.length); + onProgress?.(succeeded + failedItems.length, mediaItems.length); } // Reload store so MediaCards see the updated thumbnailId and re-fetch await useMediaLibraryStore.getState().loadMediaItems(); - log.info(`Regenerated ${regenerated}/${mediaItems.length} thumbnails`); - return regenerated; + const result = createBatchResult(mediaItems.length, failedItems); + log.info(`Regenerated ${result.succeeded}/${result.total} thumbnails`); + return result; } export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { @@ -210,32 +339,46 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { const showFilmstrips = useSettingsStore((s) => s.showFilmstrips); const autoSaveInterval = useSettingsStore((s) => s.autoSaveInterval); const maxUndoHistory = useSettingsStore((s) => s.maxUndoHistory); - const defaultWhisperModel = useSettingsStore((s) => s.defaultWhisperModel); - const defaultWhisperQuantization = useSettingsStore((s) => s.defaultWhisperQuantization); - const defaultWhisperLanguage = useSettingsStore((s) => s.defaultWhisperLanguage); + const captioningIntervalUnit = useSettingsStore((s) => s.captioningIntervalUnit); + const captioningIntervalValue = useSettingsStore((s) => s.captioningIntervalValue); const setSetting = useSettingsStore((s) => s.setSetting); const resetToDefaults = useSettingsStore((s) => s.resetToDefaults); + const intervalBounds = CAPTIONING_INTERVAL_BOUNDS[captioningIntervalUnit]; + const intervalInputStep = captioningIntervalUnit === 'seconds' ? 0.5 : 1; + const intervalUnitLabel = captioningIntervalUnit === 'seconds' ? 
'sec' : 'frames'; + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); const proxyStatus = useMediaLibraryStore((s) => s.proxyStatus); const [activeSection, setActiveSection] = useState('general'); - const [clearState, setClearState] = useState<'idle' | 'clearing' | 'done'>('idle'); + const [clearState, setClearState] = useState<'idle' | 'clearing' | 'done' | 'partial'>('idle'); const [showClearConfirm, setShowClearConfirm] = useState(false); - const [regenState, setRegenState] = useState<'idle' | 'working' | 'done'>('idle'); + const [regenState, setRegenState] = useState<'idle' | 'working' | 'done' | 'partial'>('idle'); const [regenProgress, setRegenProgress] = useState(''); - const [proxyState, setProxyState] = useState<'idle' | 'clearing' | 'done'>('idle'); + const [proxyState, setProxyState] = useState<'idle' | 'clearing' | 'done' | 'partial'>('idle'); const [proxyGenerateState, setProxyGenerateState] = useState<'idle' | 'queueing' | 'done'>('idle'); + const [clearFeedback, setClearFeedback] = useState(null); + const [regenFeedback, setRegenFeedback] = useState(null); + const [proxyFeedback, setProxyFeedback] = useState(null); const handleClearCache = useCallback(async () => { setClearState('clearing'); try { - const ids = mediaItems.map((m) => m.id); - await clearProjectCaches(ids); - setClearState('done'); + const items = mediaItems.map((m) => ({ id: m.id, fileName: m.fileName })); + const result = await clearProjectCaches(items); + const feedback = getBatchOutcomeFeedback('Clear Cache', result); + setClearFeedback(feedback); + setClearState(result.failed === 0 ? 'done' : 'partial'); + showBatchOutcomeToast('Project cache cleared', 'Project cache partially cleared', 'Project cache not cleared', result); setTimeout(() => setClearState('idle'), 2000); } catch (err) { log.error('Failed to clear caches', err); + setClearFeedback({ + tone: 'error', + message: 'Couldn\'t clear project cache.', + }); + toast.error('Failed to clear project cache'); setClearState('idle'); } }, [mediaItems]); @@ -245,16 +388,24 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { setRegenProgress('0/' + mediaItems.length); try { const items = mediaItems.map((m) => ({ id: m.id, fileName: m.fileName, mimeType: m.mimeType })); - await regenerateProjectThumbnails(items, (done, total) => { + const result = await regenerateProjectThumbnails(items, (done, total) => { setRegenProgress(`${done}/${total}`); }); - setRegenState('done'); + const feedback = getBatchOutcomeFeedback('Regenerate Thumbnails', result); + setRegenFeedback(feedback); + setRegenState(result.failed === 0 ? 'done' : 'partial'); + showBatchOutcomeToast('Thumbnails regenerated', 'Thumbnails partially regenerated', 'Thumbnails not regenerated', result); setTimeout(() => { setRegenState('idle'); setRegenProgress(''); }, 2000); } catch (err) { log.error('Failed to regenerate thumbnails', err); + setRegenFeedback({ + tone: 'error', + message: 'Couldn\'t regenerate thumbnails.', + }); + toast.error('Failed to regenerate thumbnails'); setRegenState('idle'); setRegenProgress(''); } @@ -263,11 +414,19 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { const handleClearProxies = useCallback(async () => { setProxyState('clearing'); try { - await clearProjectProxies(mediaItems); - setProxyState('done'); + const result = await clearProjectProxies(mediaItems); + const feedback = getBatchOutcomeFeedback('Delete Proxies', result); + setProxyFeedback(feedback); + setProxyState(result.failed === 0 ? 
'done' : 'partial'); + showBatchOutcomeToast('Proxies deleted', 'Proxies partially deleted', 'Proxies not deleted', result); setTimeout(() => setProxyState('idle'), 2000); } catch (err) { log.error('Failed to clear proxies', err); + setProxyFeedback({ + tone: 'error', + message: 'Couldn\'t delete proxies.', + }); + toast.error('Failed to delete proxies'); setProxyState('idle'); } }, [mediaItems]); @@ -318,8 +477,6 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) { } }, [mediaItems]); - const defaultWhisperLanguageValue = getWhisperLanguageSelectValue(defaultWhisperLanguage); - const defaultWhisperQuantizationOption = getWhisperQuantizationOption(defaultWhisperQuantization); const missingProjectProxyCount = mediaItems.filter((media) => ( media.mimeType.startsWith('video/') && proxyStatus.get(media.id) !== 'ready' @@ -424,6 +581,75 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {
)} + {activeSection === 'ai' && ( +
+
+
+
+ +

+ How often "Analyze with AI" samples a frame for captioning. +

+
+
+
+
+ {(['seconds', 'frames'] as const).map((unit) => ( + + ))} +
+ { + const parsed = Number(event.target.value); + if (Number.isFinite(parsed)) { + setSetting('captioningIntervalValue', parsed); + } + }} + /> + {intervalUnitLabel} + +
+

+ {formatCaptionEstimate(captioningIntervalUnit, captioningIntervalValue)}. + Smaller intervals produce denser scenes but take longer to generate. +

+
+
+ )} + {activeSection === 'timeline' && (
@@ -444,77 +670,6 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {
)} - {activeSection === 'whisper' && ( -
-
- - -

- Used when transcription starts without an explicit model override. -

-
- -
- - -

- Pick based on memory first. {defaultWhisperQuantizationOption.description} -

-
- -
- - - setSetting('defaultWhisperLanguage', getWhisperLanguageSettingValue(value)) - } - options={WHISPER_LANGUAGE_OPTIONS} - placeholder="Auto-detect" - searchPlaceholder="Search languages..." - emptyMessage="No languages match that search." - /> -

- Choose Auto-detect to infer the language, or lock transcription to a known language for faster startup. -

-
- - -
- )} - {activeSection === 'storage' && (
@@ -550,6 +705,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Waveforms, filmstrips, GIF frames, decoded audio

+ {clearFeedback && ( +

+ {clearFeedback.message} +

+ )}
@@ -570,6 +741,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Re-create media library thumbnails for this project

+ {regenFeedback && ( +

+ {regenFeedback.message} +

+ )}
@@ -590,6 +777,15 @@ export function SettingsDialog({ open, onOpenChange }: SettingsDialogProps) {

Remove generated proxy videos for this project

+ {proxyFeedback && ( +

+ {proxyFeedback.message} +

+ )}
- +
+
+ +

+ Unload resident runtimes or clear cached model downloads. +

+
+ + +
)}
diff --git a/src/features/editor/deps/scene-browser-contract.ts b/src/features/editor/deps/scene-browser-contract.ts
new file mode 100644
index 000000000..3f3eb127d
--- /dev/null
+++ b/src/features/editor/deps/scene-browser-contract.ts
@@ -0,0 +1,5 @@
+/**
+ * Adapter — editor shell wires the Scene Browser hotkey through this barrel.
+ */
+
+export { useSceneBrowserStore } from '@/features/scene-browser';
diff --git a/src/features/editor/deps/scene-browser.ts b/src/features/editor/deps/scene-browser.ts
new file mode 100644
index 000000000..bf30c3858
--- /dev/null
+++ b/src/features/editor/deps/scene-browser.ts
@@ -0,0 +1 @@
+export * from './scene-browser-contract';
diff --git a/src/features/editor/deps/settings-contract.ts b/src/features/editor/deps/settings-contract.ts
index b9fd760db..ba95bb9d7 100644
--- a/src/features/editor/deps/settings-contract.ts
+++ b/src/features/editor/deps/settings-contract.ts
@@ -3,7 +3,13 @@
  * Editor modules should import settings stores/services from here.
  */
 
-export { useSettingsStore } from '@/features/settings/stores/settings-store';
+export {
+  useSettingsStore,
+  CAPTIONING_INTERVAL_BOUNDS,
+  DEFAULT_CAPTIONING_INTERVAL_SECONDS,
+  resolveCaptioningIntervalSec,
+} from '@/features/settings/stores/settings-store';
+export type { CaptioningIntervalUnit } from '@/features/settings/stores/settings-store';
 export { LocalInferenceUnloadControl } from '@/features/settings/components/local-inference-unload-control';
 export { LocalModelCacheControl } from '@/features/settings/components/local-model-cache-control';
 export { useResolvedHotkeys } from '@/features/settings/hooks/use-resolved-hotkeys';
diff --git a/src/features/editor/hooks/use-editor-hotkeys.ts b/src/features/editor/hooks/use-editor-hotkeys.ts
index 1f06d114d..2775f26fa 100644
--- a/src/features/editor/hooks/use-editor-hotkeys.ts
+++ b/src/features/editor/hooks/use-editor-hotkeys.ts
@@ -2,6 +2,8 @@
 import { useHotkeys } from 'react-hotkeys-hook';
 import { HOTKEY_OPTIONS } from '@/config/hotkeys';
 import { useResolvedHotkeys } from '@/features/editor/deps/settings';
+import { useSceneBrowserStore } from '@/features/editor/deps/scene-browser';
+
 interface EditorHotkeyCallbacks {
   onSave?: () => void;
   onExport?: () => void;
@@ -13,6 +15,7 @@
  * Handles editor-level shortcuts that work across all components:
  * - Save (Ctrl+S) - Saves timeline to project
  * - Export (Ctrl+Shift+E) - Exports video
+ * - Open Scene Browser (Ctrl+Shift+F) - Opens caption search across media
  *
  * Note: Undo/Redo are handled in useTimelineShortcuts since they're timeline-specific
  *
@@ -46,4 +49,17 @@ export function useEditorHotkeys(callbacks: EditorHotkeyCallbacks = {}) {
     { ...HOTKEY_OPTIONS, eventListenerOptions: { capture: true } },
     [callbacks.onExport]
   );
+
+  // Open Scene Browser: Cmd/Ctrl+Shift+F — capture phase because the
+  // default browser binding is a no-op here, but Chrome will still eat it
+  // if our listener is in the bubbling phase.
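+  // (Roughly equivalent to registering
+  //   window.addEventListener('keydown', handler, { capture: true })
+  // which assumes react-hotkeys-hook forwards eventListenerOptions to
+  // addEventListener; treat this as an illustrative sketch, not a guarantee.)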
+ useHotkeys( + hotkeys.OPEN_SCENE_BROWSER, + (event) => { + event.preventDefault(); + useSceneBrowserStore.getState().openBrowser({ focus: true }); + }, + { ...HOTKEY_OPTIONS, eventListenerOptions: { capture: true } }, + [] + ); } diff --git a/src/features/export/utils/canvas-effects.test.ts b/src/features/export/utils/canvas-effects.test.ts index 184251324..684940da0 100644 --- a/src/features/export/utils/canvas-effects.test.ts +++ b/src/features/export/utils/canvas-effects.test.ts @@ -1,8 +1,21 @@ -import { describe, expect, it } from 'vitest'; -import { getAdjustmentLayerEffects, type AdjustmentLayerWithTrackOrder } from './canvas-effects'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; import type { AdjustmentItem } from '@/types/timeline'; import type { ItemEffect } from '@/types/effects'; +const mockFns = vi.hoisted(() => ({ + applyMasksMock: vi.fn(), +})); + +vi.mock('./canvas-masks', () => ({ + applyMasks: mockFns.applyMasksMock, +})); + +import { + getAdjustmentLayerEffects, + renderEffectsFromMaskedSource, + type AdjustmentLayerWithTrackOrder, +} from './canvas-effects'; + function createGpuEffect(id: string, amount: number): ItemEffect { return { id, @@ -33,6 +46,14 @@ function createAdjustmentLayer( return { layer, trackOrder }; } +function createMock2dContext(canvas: OffscreenCanvas): OffscreenCanvasRenderingContext2D { + return { + canvas, + drawImage: vi.fn(), + clearRect: vi.fn(), + } as unknown as OffscreenCanvasRenderingContext2D; +} + describe('getAdjustmentLayerEffects', () => { it('prefers preview overrides for active adjustment layers in preview mode', () => { const committedEffect = createGpuEffect('effect-1', 0.25); @@ -58,6 +79,27 @@ describe('getAdjustmentLayerEffects', () => { expect(effects).toEqual([committedEffect]); }); + it('uses the live adjustment layer snapshot when committed effects change in preview mode', () => { + const committedEffect = createGpuEffect('effect-1', 0.25); + const updatedEffect = createGpuEffect('effect-1', 0.8); + const adjustmentLayers = [createAdjustmentLayer('adj-1', 1, [committedEffect])]; + + const effects = getAdjustmentLayerEffects( + 3, + adjustmentLayers, + 10, + undefined, + (itemId) => itemId === 'adj-1' + ? 
{ + ...adjustmentLayers[0]!.layer, + effects: [updatedEffect], + } + : undefined, + ); + + expect(effects).toEqual([updatedEffect]); + }); + it('ignores inactive or out-of-scope adjustment layers before checking overrides', () => { const activeEffect = createGpuEffect('active', 0.4); const inactiveEffect = createGpuEffect('inactive', 0.7); @@ -89,3 +131,50 @@ describe('getAdjustmentLayerEffects', () => { expect(previewLookups).toEqual(['adj-active']); }); }); + +describe('renderEffectsFromMaskedSource', () => { + beforeEach(() => { + mockFns.applyMasksMock.mockReset(); + }); + + it('pre-masks the effect source before the effect chain draws from it', async () => { + const sourceCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const maskedSourceCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const effectCanvas = { width: 1920, height: 1080 } as OffscreenCanvas; + const maskedSourceCtx = createMock2dContext(maskedSourceCanvas); + const effectCtx = createMock2dContext(effectCanvas); + const canvasPool = { + acquire: vi.fn() + .mockReturnValueOnce({ canvas: maskedSourceCanvas, ctx: maskedSourceCtx }) + .mockReturnValueOnce({ canvas: effectCanvas, ctx: effectCtx }), + }; + const masks = [{ + path: {} as Path2D, + inverted: false, + feather: 0, + maskType: 'clip' as const, + }]; + const effect = createGpuEffect('fx-1', 0.5); + + const result = await renderEffectsFromMaskedSource( + canvasPool, + sourceCanvas, + [effect], + masks, + 12, + { width: 1920, height: 1080, fps: 30 }, + ); + + expect(mockFns.applyMasksMock).toHaveBeenCalledWith( + maskedSourceCtx, + sourceCanvas, + masks, + { width: 1920, height: 1080, fps: 30 }, + ); + expect(effectCtx.drawImage).toHaveBeenCalledWith(maskedSourceCanvas, 0, 0); + expect(result).toEqual({ + source: effectCanvas, + poolCanvases: [maskedSourceCanvas, effectCanvas], + }); + }); +}); diff --git a/src/features/export/utils/canvas-effects.ts b/src/features/export/utils/canvas-effects.ts index a16e35917..63cdfe383 100644 --- a/src/features/export/utils/canvas-effects.ts +++ b/src/features/export/utils/canvas-effects.ts @@ -5,9 +5,11 @@ */ import type { ItemEffect, GpuEffect } from '@/types/effects'; -import type { AdjustmentItem } from '@/types/timeline'; +import type { AdjustmentItem, TimelineItem } from '@/types/timeline'; import { createLogger } from '@/shared/logging/logger'; import type { EffectsPipeline, GpuEffectInstance } from '@/infrastructure/gpu/effects'; +import { applyMasks, type MaskCanvasSettings } from './canvas-masks'; +import type { CanvasPool } from './canvas-pool'; const log = createLogger('CanvasEffects'); @@ -19,6 +21,51 @@ export interface AdjustmentLayerWithTrackOrder { trackOrder: number; } +/** + * Applies any track-scoped shape masks to the source canvas before running the + * effect stack. The caller can still apply a final post-effect mask pass during + * compositing so effect bleed is trimmed to the same shape. 
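+ *
+ * @example
+ * // Illustrative sketch of the caller contract (mirrors the composition
+ * // renderer): draw the returned source, then release the pool canvases.
+ * const { source, poolCanvases } = await renderEffectsFromMaskedSource(
+ *   canvasPool, itemCanvas, effects, masks, frame, canvasSettings, gpuPipeline,
+ * );
+ * targetCtx.drawImage(source, 0, 0);
+ * for (const pooled of poolCanvases) canvasPool.release(pooled);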
+ */ +export async function renderEffectsFromMaskedSource( + canvasPool: Pick, + sourceCanvas: OffscreenCanvas, + effects: ItemEffect[], + masks: EffectSourceMask[], + frame: number, + canvas: EffectCanvasSettings & MaskCanvasSettings, + gpuPipeline?: EffectsPipeline | null, +): Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> { + const poolCanvases: OffscreenCanvas[] = []; + let effectSource = sourceCanvas; + + if (masks.length > 0) { + const { canvas: maskedSourceCanvas, ctx: maskedSourceCtx } = canvasPool.acquire(); + applyMasks(maskedSourceCtx, sourceCanvas, masks, canvas); + effectSource = maskedSourceCanvas; + poolCanvases.push(maskedSourceCanvas); + } + + if (effects.length === 0) { + return { source: effectSource, poolCanvases }; + } + + const { canvas: effectCanvas, ctx: effectCtx } = canvasPool.acquire(); + const deferredGpuCanvas = await applyAllEffectsAsync( + effectCtx, + effectSource, + effects, + frame, + canvas, + gpuPipeline, + ); + poolCanvases.push(effectCanvas); + + return { + source: deferredGpuCanvas ?? effectCanvas, + poolCanvases, + }; +} + /** * Canvas settings for effect rendering */ @@ -27,6 +74,14 @@ interface EffectCanvasSettings { height: number; } +export interface EffectSourceMask { + path: Path2D; + inverted: boolean; + feather: number; + maskType: 'clip' | 'alpha'; + trackOrder?: number; +} + // ============================================================================ // GPU Effects // ============================================================================ @@ -110,10 +165,18 @@ export function getAdjustmentLayerEffects( adjustmentLayers: AdjustmentLayerWithTrackOrder[], frame: number, getPreviewEffectsOverride?: (itemId: string) => ItemEffect[] | undefined, + getLiveItemSnapshot?: (itemId: string) => TimelineItem | undefined, ): ItemEffect[] { if (adjustmentLayers.length === 0) return []; return adjustmentLayers + .map(({ layer, trackOrder }) => { + const liveLayer = getLiveItemSnapshot?.(layer.id); + return { + layer: liveLayer?.type === 'adjustment' ? 
liveLayer : layer, + trackOrder, + }; + }) .filter(({ layer, trackOrder }) => { // Item must be BEHIND the adjustment (higher track order = lower zIndex) if (itemTrackOrder <= trackOrder) return false; diff --git a/src/features/export/utils/canvas-item-renderer.ts b/src/features/export/utils/canvas-item-renderer.ts index 6c3db0368..5103993c3 100644 --- a/src/features/export/utils/canvas-item-renderer.ts +++ b/src/features/export/utils/canvas-item-renderer.ts @@ -26,9 +26,10 @@ import { doesMaskAffectTrack } from '@/shared/utils/mask-scope'; // Subsystem imports import { getAnimatedTransform } from './canvas-keyframes'; import { - applyAllEffectsAsync, + renderEffectsFromMaskedSource, getAdjustmentLayerEffects, combineEffects, + type EffectSourceMask, type AdjustmentLayerWithTrackOrder, } from './canvas-effects'; import { @@ -122,6 +123,7 @@ export interface ItemRenderContext { renderMode: 'export' | 'preview'; scrubbingCache?: ScrubbingCache | null; getCurrentItemSnapshot?: (item: TItem) => TItem; + getLiveItemSnapshotById?: (itemId: string) => TimelineItem | undefined; getCurrentKeyframes?: (itemId: string) => ItemKeyframes | undefined; getPreviewTransformOverride?: (itemId: string) => Partial | undefined; getPreviewCornerPinOverride?: (itemId: string) => TimelineItem['cornerPin'] | undefined; @@ -1536,6 +1538,7 @@ export async function renderTransitionToCanvas( frame: number, rctx: ItemRenderContext, trackOrder: number, + trackMasks: EffectSourceMask[] = [], ): Promise { const { canvasPool, canvasSettings } = rctx; const { leftClip, rightClip } = activeTransition; @@ -1568,41 +1571,49 @@ export async function renderTransitionToCanvas( const hasLeftEffects = leftCombinedEffects.length > 0; const hasRightEffects = rightCombinedEffects.length > 0; - // Track pool effect canvases separately — in GPU batch mode the final - // source may be a GPU output canvas (not from the pool), but the pool - // canvases still need to be released. - let leftEffectPoolCanvas: OffscreenCanvas | null = null; - let rightEffectPoolCanvas: OffscreenCanvas | null = null; + const leftEffectPoolCanvases: OffscreenCanvas[] = []; + const rightEffectPoolCanvases: OffscreenCanvas[] = []; if (hasLeftEffects || hasRightEffects) { - // In GPU batch mode, applyAllEffectsAsync returns a deferred GPU canvas - // instead of drawing back to the effect canvas. We must capture and use - // the returned canvas, otherwise effects are silently dropped. 
- let leftGpuPromise: Promise | undefined; - let rightGpuPromise: Promise | undefined; + let leftEffectsPromise: Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> | undefined; + let rightEffectsPromise: Promise<{ source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }> | undefined; if (hasLeftEffects) { - const { canvas: leftEffectCanvas, ctx: leftEffectCtx } = canvasPool.acquire(); - leftEffectPoolCanvas = leftEffectCanvas; - leftFinalCanvas = leftEffectCanvas; - leftGpuPromise = applyAllEffectsAsync(leftEffectCtx, leftCanvas, leftCombinedEffects, frame, canvasSettings, rctx.gpuPipeline); + leftEffectsPromise = renderEffectsFromMaskedSource( + canvasPool, + leftCanvas, + leftCombinedEffects, + trackMasks, + frame, + canvasSettings, + rctx.gpuPipeline, + ); } if (hasRightEffects) { - const { canvas: rightEffectCanvas, ctx: rightEffectCtx } = canvasPool.acquire(); - rightEffectPoolCanvas = rightEffectCanvas; - rightFinalCanvas = rightEffectCanvas; - rightGpuPromise = applyAllEffectsAsync(rightEffectCtx, rightCanvas, rightCombinedEffects, frame, canvasSettings, rctx.gpuPipeline); + rightEffectsPromise = renderEffectsFromMaskedSource( + canvasPool, + rightCanvas, + rightCombinedEffects, + trackMasks, + frame, + canvasSettings, + rctx.gpuPipeline, + ); } - const [leftGpu, rightGpu] = await Promise.all([ - leftGpuPromise ?? Promise.resolve(null), - rightGpuPromise ?? Promise.resolve(null), + const [leftEffects, rightEffects] = await Promise.all([ + leftEffectsPromise ?? Promise.resolve(null), + rightEffectsPromise ?? Promise.resolve(null), ]); - // Use deferred GPU canvas when returned (batch mode), otherwise the - // effect canvas already has the result drawn into it. - if (leftGpu) leftFinalCanvas = leftGpu; - if (rightGpu) rightFinalCanvas = rightGpu; + if (leftEffects) { + leftFinalCanvas = leftEffects.source; + leftEffectPoolCanvases.push(...leftEffects.poolCanvases); + } + if (rightEffects) { + rightFinalCanvas = rightEffects.source; + rightEffectPoolCanvases.push(...rightEffects.poolCanvases); + } } // Render transition with effect-applied canvases @@ -1610,9 +1621,9 @@ export async function renderTransitionToCanvas( renderTransition(ctx, activeTransition, leftFinalCanvas, rightFinalCanvas, transitionSettings, rctx.gpuTransitionPipeline); // Release all pool canvases (GPU output canvases are managed by the pipeline) - if (leftEffectPoolCanvas) canvasPool.release(leftEffectPoolCanvas); + for (const effectCanvas of leftEffectPoolCanvases) canvasPool.release(effectCanvas); canvasPool.release(leftCanvas); - if (rightEffectPoolCanvas) canvasPool.release(rightEffectPoolCanvas); + for (const effectCanvas of rightEffectPoolCanvases) canvasPool.release(effectCanvas); canvasPool.release(rightCanvas); } @@ -1660,6 +1671,7 @@ export function resolveTransitionParticipantRenderState doesMaskAffectTrack(mask.trackOrder, trackOrder)); // NOTE: The importExternalTexture zero-copy path is disabled because // textureSampleBaseClampToEdge produces subtly different edge pixel values @@ -1233,16 +1236,22 @@ export async function createCompositionRenderer( getLog().warn('GPU pipeline init failed — GPU effects will be skipped'); } } - const { canvas: effectCanvas, ctx: effectCtx } = canvasPool.acquire(); - const deferredGpuCanvas = await applyAllEffectsAsync(effectCtx, itemCanvas, combinedEffects, frame, canvasSettings, itemRenderContext.gpuPipeline); + const { source, poolCanvases } = await renderEffectsFromMaskedSource( + canvasPool, + itemCanvas, + combinedEffects, + applicableMasks, + 
frame, + maskSettings, + itemRenderContext.gpuPipeline, + ); canvasPool.release(itemCanvas); - const source = deferredGpuCanvas ?? effectCanvas; if (deferred) { - return { source, poolCanvases: [effectCanvas] }; + return { source, poolCanvases }; } targetCtx.drawImage(source, 0, 0); - canvasPool.release(effectCanvas); + for (const effectCanvas of poolCanvases) canvasPool.release(effectCanvas); return null; } @@ -1334,6 +1343,7 @@ export async function createCompositionRenderer( adjustmentLayers, frame, renderMode === 'preview' ? getPreviewEffectsOverride : undefined, + renderMode === 'preview' ? getLiveItemSnapshot : undefined, ); const allEffects = [...itemEffects, ...adjEffects]; @@ -1462,7 +1472,14 @@ export async function createCompositionRenderer( } // Transitions: render to a dedicated canvas const { canvas: trCanvas, ctx: trCtx } = canvasPool.acquire(); - await renderTransitionToCanvas(trCtx, task.transition, frame, itemRenderContext, task.trackOrder); + await renderTransitionToCanvas( + trCtx, + task.transition, + frame, + itemRenderContext, + task.trackOrder, + activeMasks.filter((mask) => doesMaskAffectTrack(mask.trackOrder, task.trackOrder)), + ); return { source: trCanvas, poolCanvases: [trCanvas] } as { source: OffscreenCanvas; poolCanvases: OffscreenCanvas[] }; }), ); diff --git a/src/features/media-library/components/background-task-progress.test.tsx b/src/features/media-library/components/background-task-progress.test.tsx new file mode 100644 index 000000000..6db6525a5 --- /dev/null +++ b/src/features/media-library/components/background-task-progress.test.tsx @@ -0,0 +1,49 @@ +import { fireEvent, render, screen } from '@testing-library/react'; +import { describe, expect, it, vi } from 'vitest'; +import { BackgroundTaskProgress } from './background-task-progress'; + +describe('BackgroundTaskProgress', () => { + it('renders determinate progress with custom meta actions', () => { + const onCancel = vi.fn(); + + render( + icon} + label="Generating transcripts" + progressAriaLabel="Transcript generation progress" + progressPercent={42.4} + meta={( + <> + 42% + + + )} + fillClassName="bg-blue-500" + /> + ); + + expect(screen.getByText('Generating transcripts')).toBeInTheDocument(); + expect(screen.getByRole('progressbar', { name: 'Transcript generation progress' })) + .toHaveAttribute('aria-valuenow', '42'); + + fireEvent.click(screen.getByText('Cancel all')); + expect(onCancel).toHaveBeenCalledTimes(1); + }); + + it('renders indeterminate progress without a numeric value', () => { + render( + icon} + label="Analyzing with AI" + progressAriaLabel="AI analysis progress" + indeterminate + meta={Working...} + fillClassName="bg-purple-500" + /> + ); + + expect(screen.getByRole('progressbar', { name: 'AI analysis progress' })) + .not.toHaveAttribute('aria-valuenow'); + expect(screen.getByText('Working...')).toBeInTheDocument(); + }); +}); diff --git a/src/features/media-library/components/background-task-progress.tsx b/src/features/media-library/components/background-task-progress.tsx new file mode 100644 index 000000000..b4730e453 --- /dev/null +++ b/src/features/media-library/components/background-task-progress.tsx @@ -0,0 +1,66 @@ +import type { ReactNode } from 'react'; +import { cn } from '@/shared/ui/cn'; + +interface BackgroundTaskProgressProps { + icon: ReactNode; + label: string; + progressAriaLabel: string; + progressPercent?: number | null; + indeterminate?: boolean; + meta?: ReactNode; + trailing?: ReactNode; + fillClassName: string; +} + +export function 
BackgroundTaskProgress({ + icon, + label, + progressAriaLabel, + progressPercent = null, + indeterminate = false, + meta, + trailing, + fillClassName, +}: BackgroundTaskProgressProps) { + const clampedPercent = progressPercent == null + ? null + : Math.max(0, Math.min(100, Math.round(progressPercent))); + + return ( +
+        {/* …component markup lost in extraction: renders {icon} and {label},
+            the optional {meta} actions, a role="progressbar" element named by
+            progressAriaLabel (aria-valuenow is clampedPercent unless
+            indeterminate) with a fill bar styled by fillClassName, and the
+            optional {trailing} slot; see the sketch after the media-card
+            hunk below… */}
+ ); +} diff --git a/src/features/media-library/components/compositions-section.tsx b/src/features/media-library/components/compositions-section.tsx index 00ed03720..6ede2d955 100644 --- a/src/features/media-library/components/compositions-section.tsx +++ b/src/features/media-library/components/compositions-section.tsx @@ -52,6 +52,7 @@ export function CompositionsSection() { const viewMode = useMediaLibraryStore((s) => s.viewMode); const mediaItemSize = useMediaLibraryStore((s) => s.mediaItemSize); const selectedCompositionIds = useMediaLibraryStore((s) => s.selectedCompositionIds); + const isTranscriptionDialogOpen = useEditorStore((s) => s.transcriptionDialogDepth > 0); const selectedCompositionIdSet = useMemo(() => new Set(selectedCompositionIds), [selectedCompositionIds]); const [open, setOpen] = useState(true); const [deleteTarget, setDeleteTarget] = useState(null); @@ -200,6 +201,7 @@ export function CompositionsSection() { composition={comp} viewMode={viewMode} selected={selectedCompositionIdSet.has(comp.id)} + isTranscriptionDialogOpen={isTranscriptionDialogOpen} dragDisabled={wouldCreateCompositionCycle({ parentCompositionId: activeCompositionId, insertedCompositionId: comp.id, @@ -263,6 +265,7 @@ interface CompositionCardProps { composition: SubComposition; viewMode: 'grid' | 'list'; selected: boolean; + isTranscriptionDialogOpen: boolean; dragDisabled: boolean; isEditing: boolean; editValue: string; @@ -279,6 +282,7 @@ const CompositionCard = memo(function CompositionCard({ composition, viewMode, selected, + isTranscriptionDialogOpen, dragDisabled, isEditing, editValue, @@ -375,7 +379,7 @@ const CompositionCard = memo(function CompositionCard({ [isEditing, onSelect] ); - const canHoverPreview = composition.durationInFrames > 0; + const canHoverPreview = composition.durationInFrames > 0 && !isTranscriptionDialogOpen; const updateSkimPreview = useCallback((clientX: number) => { const thumbnailContainer = thumbnailContainerRef.current; diff --git a/src/features/media-library/components/media-card.test.tsx b/src/features/media-library/components/media-card.test.tsx index 4c8a1eca7..01b4c648c 100644 --- a/src/features/media-library/components/media-card.test.tsx +++ b/src/features/media-library/components/media-card.test.tsx @@ -7,6 +7,7 @@ const mediaLibraryServiceMocks = vi.hoisted(() => ({ getThumbnailBlobUrl: vi.fn(), getMediaFile: vi.fn(), getMediaBlobUrl: vi.fn(), + updateMediaCaptions: vi.fn(), })); const proxyServiceMocks = vi.hoisted(() => ({ @@ -20,6 +21,8 @@ const proxyServiceMocks = vi.hoisted(() => ({ const mediaTranscriptionServiceMocks = vi.hoisted(() => ({ transcribeMedia: vi.fn(), + deleteTranscript: vi.fn(), + cancelTranscription: vi.fn(), })); const mediaStoreState = vi.hoisted(() => ({ @@ -28,7 +31,7 @@ const mediaStoreState = vi.hoisted(() => ({ importingIds: [] as string[], proxyStatus: new Map(), proxyProgress: new Map(), - transcriptStatus: new Map(), + transcriptStatus: new Map(), transcriptProgress: new Map(), taggingMediaIds: new Set(), setProxyStatus: vi.fn(), @@ -37,7 +40,18 @@ const mediaStoreState = vi.hoisted(() => ({ setTranscriptProgress: vi.fn(), clearTranscriptProgress: vi.fn(), setTaggingMedia: vi.fn(), + updateMediaCaptions: vi.fn(), showNotification: vi.fn(), + analysisProgress: null as null | { total: number; completed: number; cancelRequested: boolean }, + beginAnalysisRun: vi.fn(), + incrementAnalysisCompleted: vi.fn(), + requestAnalysisCancel: vi.fn(), + endAnalysisRun: vi.fn(), +})); + +const analysisMocks = vi.hoisted(() => ({ + 
captionVideo: vi.fn(), + captionImage: vi.fn(), })); const editorStoreState = vi.hoisted(() => ({ @@ -47,6 +61,10 @@ const editorStoreState = vi.hoisted(() => ({ mediaSkimPreviewMediaId: null as string | null, })); +const playbackStoreState = vi.hoisted(() => ({ + pause: vi.fn(), +})); + const sourcePlayerStoreState = vi.hoisted(() => ({ setCurrentMediaId: vi.fn(), clearInOutPoints: vi.fn(), @@ -82,6 +100,41 @@ vi.mock('@/components/ui/button', () => ({ }) => , })); +vi.mock('./transcribe-dialog', () => ({ + TranscribeDialog: ({ + open, + onStart, + onCancel, + }: { + open: boolean; + onStart: (values: { + model: string; + quantization: string; + language: string; + }) => void; + onCancel: () => void; + }) => + open ? ( +
+        {/* …mock markup lost in extraction: controls that forward fixed
+            model/quantization/language values to onStart and invoke
+            onCancel… */}
+ ) : null, +})); + vi.mock('./media-info-popover', () => ({ MediaInfoPopover: ({ onSeekToCaption }: { onSeekToCaption?: (timeSec: number) => void }) => (
+ ); } // Grid view return ( + <> + {transcribeDialog}
)} {!isBroken && proxyStatus === 'generating' && ( -
+            {/* …overlay markup lost in extraction: proxy-generation
+                indicator… */}
+          )}
+          {!isBroken && isTagging && (
+            {/* …overlay markup lost in extraction: AI-analysis (tagging)
+                indicator… */}
)} @@ -883,6 +1027,21 @@ export const MediaCard = memo(function MediaCard({ style={{ left: `${skimProgress * 100}%` }} /> )} + {!isBroken && !isImporting && isTranscribing && transcriptProgressPercent !== null && ( +
+            {/* …markup lost in extraction: thin transcription progress bar
+                whose fill width tracks transcriptProgressPercent… */}
+          )}
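
Since the JSX of `BackgroundTaskProgress` was lost to extraction above, here is a minimal sketch reconstructed from its props interface and the assertions in `background-task-progress.test.tsx`. Element structure and styling are assumptions; only the aria behavior is pinned down by the tests.

```tsx
import type { ReactNode } from 'react';

interface BackgroundTaskProgressProps {
  icon: ReactNode;
  label: string;
  progressAriaLabel: string;
  progressPercent?: number | null;
  indeterminate?: boolean;
  meta?: ReactNode;
  trailing?: ReactNode;
  fillClassName: string;
}

// Sketch only: layout and class names are guesses, not the PR's markup.
export function BackgroundTaskProgress({
  icon,
  label,
  progressAriaLabel,
  progressPercent = null,
  indeterminate = false,
  meta,
  trailing,
  fillClassName,
}: BackgroundTaskProgressProps) {
  // Percent is clamped to [0, 100] and rounded before reaching the DOM.
  const clampedPercent = progressPercent == null
    ? null
    : Math.max(0, Math.min(100, Math.round(progressPercent)));

  // Indeterminate bars omit aria-valuenow, per the second test case.
  const ariaValueNow =
    indeterminate || clampedPercent == null ? undefined : clampedPercent;

  return (
    <div>
      {icon}
      <span>{label}</span>
      {meta && <div>{meta}</div>}
      <div
        role="progressbar"
        aria-label={progressAriaLabel}
        aria-valuenow={ariaValueNow}
        aria-valuemin={0}
        aria-valuemax={100}
      >
        <div
          className={fillClassName}
          style={{ width: `${clampedPercent ?? 100}%` }}
        />
      </div>
      {trailing}
    </div>
  );
}
```
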
{/* Content footer - minimal */} @@ -918,5 +1077,6 @@ export const MediaCard = memo(function MediaCard({
+ ); }); diff --git a/src/features/media-library/components/media-info-popover.test.tsx b/src/features/media-library/components/media-info-popover.test.tsx new file mode 100644 index 000000000..94f73c1c9 --- /dev/null +++ b/src/features/media-library/components/media-info-popover.test.tsx @@ -0,0 +1,125 @@ +import type { ReactNode, MouseEvent } from 'react'; +import React, { createContext, cloneElement, isValidElement, useContext } from 'react'; +import { render, screen, fireEvent, waitFor } from '@testing-library/react'; +import { describe, it, expect, vi, beforeEach } from 'vitest'; +import type { MediaMetadata, MediaTranscript } from '@/types/storage'; + +const mediaTranscriptionServiceMocks = vi.hoisted(() => ({ + getTranscript: vi.fn(), +})); + +vi.mock('@/components/ui/popover', () => { + const PopoverContext = createContext<{ + open: boolean; + onOpenChange: (open: boolean) => void; + }>({ + open: false, + onOpenChange: () => {}, + }); + + return { + Popover: ({ + children, + open = false, + onOpenChange = () => {}, + }: { + children: ReactNode; + open?: boolean; + onOpenChange?: (open: boolean) => void; + }) => ( + +
+      <PopoverContext.Provider value={{ open, onOpenChange }}>
+        {children}
+      </PopoverContext.Provider>
+    ),
+    PopoverTrigger: ({
+      children,
+      asChild,
+    }: {
+      children: ReactNode;
+      asChild?: boolean;
+    }) => {
+      const { open, onOpenChange } = useContext(PopoverContext);
+      if (asChild && isValidElement(children)) {
+        return cloneElement(children, {
+          onClick: (event: MouseEvent) => {
+            children.props.onClick?.(event);
+            onOpenChange(!open);
+          },
+        });
+      }
+      // Reconstructed: the original fallback trigger element was lost in
+      // extraction; a plain toggle button is a safe stand-in for a mock.
+      return <button onClick={() => onOpenChange(!open)}>{children}</button>;
+    },
+    PopoverContent: ({ children }: { children: ReactNode }) => {
+      const { open } = useContext(PopoverContext);
+      return open ?
+        <div>{children}</div>
: null; + }, + }; +}); + +vi.mock('../services/media-transcription-service', () => ({ + mediaTranscriptionService: mediaTranscriptionServiceMocks, +})); + +vi.mock('../transcription/registry', () => ({ + getMediaTranscriptionModelLabel: (model: string) => model === 'whisper-tiny' ? 'Tiny' : model, +})); + +import { MediaInfoPopover } from './media-info-popover'; + +function makeMedia(overrides: Partial = {}): MediaMetadata { + return { + id: 'media-1', + storageType: 'handle', + fileName: 'clip.mp4', + fileSize: 1024, + mimeType: 'video/mp4', + duration: 5, + width: 1920, + height: 1080, + fps: 30, + codec: 'h264', + bitrate: 5000, + tags: [], + createdAt: 1, + updatedAt: 1, + ...overrides, + }; +} + +describe('MediaInfoPopover', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('loads and displays transcript details when opened', async () => { + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'whisper-tiny', + quantization: 'q8', + text: 'Hello world from transcript', + segments: [ + { text: 'Hello world', start: 1.25, end: 2.5 }, + ], + createdAt: 1, + updatedAt: 1, + }; + mediaTranscriptionServiceMocks.getTranscript.mockResolvedValue(transcript); + const onSeekToCaption = vi.fn(); + + render(); + + fireEvent.click(screen.getByTitle('Media info')); + + await waitFor(() => { + expect(mediaTranscriptionServiceMocks.getTranscript).toHaveBeenCalledWith('media-1'); + }); + + expect(await screen.findByText('Transcript (1)')).toBeInTheDocument(); + expect(screen.getByText('Tiny')).toBeInTheDocument(); + expect(screen.getByText('Hello world from transcript')).toBeInTheDocument(); + + fireEvent.click(screen.getByRole('button', { name: '0:01' })); + expect(onSeekToCaption).toHaveBeenCalledWith(1.25); + }); +}); diff --git a/src/features/media-library/components/media-info-popover.tsx b/src/features/media-library/components/media-info-popover.tsx index 5645a090a..805a4bb25 100644 --- a/src/features/media-library/components/media-info-popover.tsx +++ b/src/features/media-library/components/media-info-popover.tsx @@ -1,8 +1,11 @@ -import { Info, Video, FileAudio, Image as ImageIcon, Film, Clock, Maximize2, HardDrive, FileType, Sparkles } from 'lucide-react'; +import { Info, Video, FileAudio, Image as ImageIcon, Film, Clock, Maximize2, HardDrive, FileType, Loader2, FileText } from 'lucide-react'; +import { useEffect, useState } from 'react'; import { Popover, PopoverContent, PopoverTrigger } from '@/components/ui/popover'; -import type { MediaMetadata } from '@/types/storage'; +import type { MediaMetadata, MediaTranscript } from '@/types/storage'; import { getMediaType, formatDuration } from '../utils/validation'; import { formatBytes } from '@/shared/utils/format-utils'; +import { mediaTranscriptionService } from '../services/media-transcription-service'; +import { getMediaTranscriptionModelLabel } from '../transcription/registry'; function formatTimestamp(sec: number): string { const m = Math.floor(sec / 60); @@ -19,8 +22,12 @@ interface MediaInfoPopoverProps { } export function MediaInfoPopover({ media, triggerClassName, onSeekToCaption }: MediaInfoPopoverProps) { + const [open, setOpen] = useState(false); + const [transcript, setTranscript] = useState(null); + const [transcriptLoading, setTranscriptLoading] = useState(false); const mediaType = getMediaType(media.mimeType); const typeLabel = mediaType === 'video' ? 'Video' : mediaType === 'audio' ? 
'Audio' : 'Image'; + const isTranscribable = mediaType === 'video' || mediaType === 'audio'; const rows: Array<{ icon: React.ReactNode; label: string; value: string }> = []; @@ -46,8 +53,33 @@ export function MediaInfoPopover({ media, triggerClassName, onSeekToCaption }: M rows.push({ icon: , label: 'Frame Rate', value: `${media.fps.toFixed(2)} fps` }); } + useEffect(() => { + if (!open || !isTranscribable) { + return; + } + + let cancelled = false; + setTranscriptLoading(true); + + void mediaTranscriptionService.getTranscript(media.id) + .then((loadedTranscript) => { + if (!cancelled) { + setTranscript(loadedTranscript ?? null); + } + }) + .finally(() => { + if (!cancelled) { + setTranscriptLoading(false); + } + }); + + return () => { + cancelled = true; + }; + }, [isTranscribable, media.id, open]); + return ( - + - {caption.text} + {transcriptLoading ? ( +
+              {/* …row markup lost in extraction: loading spinner… */}
+              Loading transcript...
+            ) : transcript ? (
+              {/* …container markup lost in extraction… */}
+              {transcript.text}
+              {transcript.segments.map((segment, i) => (
+                {/* …row markup lost in extraction: a timestamp button
+                    labeled formatTimestamp(segment.start) that calls
+                    onSeekToCaption(segment.start), followed by
+                    {segment.text}; see the sketch after this hunk… */}
+              ))}
-              ))}
-
+            ) : null}
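
For reference: the hunk above shows `formatTimestamp` only down to its first statement. A plausible completion, consistent with the `'0:01'` button name the test expects for `start: 1.25` (the remainder of the body is an inference, not the PR's code):

```ts
function formatTimestamp(sec: number): string {
  const m = Math.floor(sec / 60);
  // Inferred: whole seconds, zero-padded to two digits (1.25 -> "0:01").
  const s = Math.floor(sec % 60);
  return `${m}:${s.toString().padStart(2, '0')}`;
}
```
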
)} diff --git a/src/features/media-library/components/media-library.tsx b/src/features/media-library/components/media-library.tsx index 6119ba65f..40d8c21f9 100644 --- a/src/features/media-library/components/media-library.tsx +++ b/src/features/media-library/components/media-library.tsx @@ -1,5 +1,6 @@ import { useEffect, useRef, useState, useMemo, memo, useCallback } from 'react'; -import { Search, Filter, SortAsc, Video, FileAudio, Image as ImageIcon, Trash2, Grid3x3, List, AlertTriangle, Info, X, FolderOpen, Link2Off, ChevronRight, Film, ArrowLeft, Zap, Loader2, Copy, Check, Upload } from 'lucide-react'; +import { Search, Filter, SortAsc, Video, FileAudio, Image as ImageIcon, Trash2, Grid3x3, List, AlertTriangle, Info, X, FolderOpen, Link2Off, ChevronRight, Film, ArrowLeft, Zap, Loader2, Copy, Check, Upload, Sparkles, FileText, ScanSearch } from 'lucide-react'; +import { SceneBrowserPanel, useSceneBrowserStore } from '../deps/scene-browser'; import { createLogger } from '@/shared/logging/logger'; const logger = createLogger('MediaLibrary'); @@ -28,10 +29,17 @@ import { CollapsibleContent, CollapsibleTrigger, } from '@/components/ui/collapsible'; +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from '@/components/ui/tooltip'; import { MarqueeOverlay } from '@/components/marquee-overlay'; import { cn } from '@/shared/ui/cn'; import { MediaGrid } from './media-grid'; import { CompositionsSection } from './compositions-section'; +import { BackgroundTaskProgress } from './background-task-progress'; import { MissingMediaDialog } from './missing-media-dialog'; import { OrphanedClipsDialog } from './orphaned-clips-dialog'; import { UnsupportedAudioCodecDialog } from './unsupported-audio-codec-dialog'; @@ -47,10 +55,16 @@ import { import { useProjectStore } from '@/features/media-library/deps/projects'; import { proxyService } from '../services/proxy-service'; import { mediaLibraryService } from '../services/media-library-service'; +import { mediaTranscriptionService } from '../services/media-transcription-service'; +import { mediaAnalysisService } from '../services/media-analysis-service'; import { extractValidMediaFileEntriesFromDataTransfer } from '../utils/file-drop'; import { getSharedProxyKey } from '../utils/proxy-key'; import { getMediaType } from '../utils/validation'; import { getProjectBrokenMediaIds } from '@/features/media-library/utils/broken-media'; +import { + getTranscriptionOverallProgress, + getTranscriptionStageLabel, +} from '@/shared/utils/transcription-progress'; import type { MediaMetadata } from '@/types/storage'; import { isMarqueeJustFinished, useMarqueeSelection, type MarqueeItem } from '@/hooks/use-marquee-selection'; @@ -72,6 +86,21 @@ function CopyButton({ text }: { text: string }) { ); } +function HeaderActionTooltip({ + label, + children, +}: { + label: string; + children: React.ReactNode; +}) { + return ( + + {children} + {label} + + ); +} + const GROUP_ICONS = { video: Video, audio: FileAudio, @@ -158,6 +187,8 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL const setSortBy = useMediaLibraryStore((s) => s.setSortBy); const viewMode = useMediaLibraryStore((s) => s.viewMode); const setViewMode = useMediaLibraryStore((s) => s.setViewMode); + const sceneBrowserOpen = useSceneBrowserStore((s) => s.open); + const toggleSceneBrowser = useSceneBrowserStore((s) => s.toggleBrowser); const mediaItemSize = useMediaLibraryStore((s) => s.mediaItemSize); const setMediaItemSize = useMediaLibraryStore((s) => 
s.setMediaItemSize); const selectedMediaIds = useMediaLibraryStore((s) => s.selectedMediaIds); @@ -175,6 +206,8 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL const projectStoreProjectId = useProjectStore((s) => s.currentProject?.id ?? null); const proxyStatus = useMediaLibraryStore((s) => s.proxyStatus); const proxyProgress = useMediaLibraryStore((s) => s.proxyProgress); + const transcriptStatus = useMediaLibraryStore((s) => s.transcriptStatus); + const transcriptProgress = useMediaLibraryStore((s) => s.transcriptProgress); const filteredMediaItems = useFilteredMediaItems(); const mediaGroups = useMemo(() => { const groups: { key: string; label: string; icon: 'video' | 'audio' | 'image' | 'gif'; items: MediaMetadata[] }[] = []; @@ -487,6 +520,19 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return count; }, [proxyStatus]); + const analysisProgress = useMediaLibraryStore((s) => s.analysisProgress); + const analysisPercent = analysisProgress && analysisProgress.total > 0 + ? (analysisProgress.completed / analysisProgress.total) * 100 + : 0; + + const transcribingCount = useMemo(() => { + let count = 0; + for (const status of transcriptStatus.values()) { + if (status === 'queued' || status === 'transcribing') count++; + } + return count; + }, [transcriptStatus]); + const currentProjectBrokenMediaIds = useMemo( () => getProjectBrokenMediaIds(brokenMediaIds, mediaById), [brokenMediaIds, mediaById] @@ -506,6 +552,31 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return count > 0 ? total / count : 0; }, [proxyStatus, proxyProgress, generatingCount]); + const transcribingAvgProgress = useMemo(() => { + if (transcribingCount === 0) return 0; + let total = 0; + let count = 0; + for (const [id, status] of transcriptStatus.entries()) { + if (status === 'queued' || status === 'transcribing') { + const progress = transcriptProgress.get(id); + total += progress ? getTranscriptionOverallProgress(progress) : 0; + count++; + } + } + return count > 0 ? total / count : 0; + }, [transcriptStatus, transcriptProgress, transcribingCount]); + + const singleTranscriptionStageLabel = useMemo(() => { + if (transcribingCount !== 1) return null; + for (const [id, status] of transcriptStatus.entries()) { + if (status === 'queued' || status === 'transcribing') { + const progress = transcriptProgress.get(id); + return progress ? getTranscriptionStageLabel(progress.stage) : null; + } + } + return null; + }, [transcriptStatus, transcriptProgress, transcribingCount]); + const handleGenerateSelectedProxies = async () => { const selectedItems = selectedMediaIds .map((id) => mediaById[id]) @@ -542,6 +613,16 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL } }; + const handleCancelAllTranscriptions = () => { + for (const [mediaId, status] of transcriptStatus.entries()) { + if (status !== 'queued' && status !== 'transcribing') { + continue; + } + + mediaTranscriptionService.cancelTranscription(mediaId); + } + }; + // Count selected items that are eligible for proxy generation const selectedProxyEligibleCount = useMemo(() => { return selectedMediaIds.filter((id) => { @@ -622,85 +703,112 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL return (
{/* Header toolbar */} -
-
- {/* Import action */} - - - {/* Missing media indicator */} - {currentProjectBrokenMediaIds.length > 0 && ( - - )} - - - {/* Selection indicator & actions */} - {selectedAssetCount > 0 && ( - <> -
+
+ +
+ {/* Import action */} + + + - {/* Selection badge */} -
- {selectedAssetCount} - selected - -
+ {/* Scene browser view toggle — lives here with Import (not in + the filter row) because it switches the whole panel between + media-library and scene-captioner views; the search/filter + bar below only scopes whichever view is mounted. */} + + + - {/* Generate proxies for selection */} - {selectedProxyEligibleCount > 0 && ( + {/* Missing media indicator */} + {currentProjectBrokenMediaIds.length > 0 && ( + - )} + + )} - {/* Delete action */} - - - )} -
+ + {/* Selection indicator & actions */} + {selectedAssetCount > 0 && ( + <> +
+ + {/* Selection badge */} +
+ {selectedAssetCount} + selected + + + +
+ + {/* Generate proxies for selection */} + {selectedProxyEligibleCount > 0 && ( + + + + )} + + {/* Delete action */} + + + + + )} +
+
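
`HeaderActionTooltip`, defined earlier in this file's diff, lost its JSX to extraction. A sketch assuming the conventional composition of the shadcn/ui tooltip parts the file imports (the real markup may differ):

```tsx
import type { ReactNode } from 'react';
import {
  Tooltip,
  TooltipContent,
  TooltipProvider,
  TooltipTrigger,
} from '@/components/ui/tooltip';

// Sketch: wraps a toolbar icon button with a labeled tooltip.
function HeaderActionTooltip({
  label,
  children,
}: {
  label: string;
  children: ReactNode;
}) {
  return (
    <TooltipProvider>
      <Tooltip>
        <TooltipTrigger asChild>{children}</TooltipTrigger>
        <TooltipContent>{label}</TooltipContent>
      </Tooltip>
    </TooltipProvider>
  );
}
```
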
{/* Error message */} @@ -772,7 +880,9 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL
)} - {/* Search and filters */} + {/* Search and filters — hidden in Scene mode since they only scope + the media library grid; the scene browser has its own search. */} + {!sceneBrowserOpen && (
{/* Search */}
@@ -923,6 +1033,7 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL
+ )} {/* Composition navigation banner — shown when inside a sub-composition */} {activeCompositionId !== null && activeCompLabel && ( @@ -941,9 +1052,13 @@ export const MediaLibrary = memo(function MediaLibrary({ onMediaSelect }: MediaL {/* Scrollable content: wrapper provides relative context for the drag overlay */}
+          {sceneBrowserOpen && <SceneBrowserPanel />}
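
For orientation, a minimal sketch of the store contract behind this toggle. Only `open` and `toggleBrowser` are confirmed by the usage in this hunk; zustand is assumed from the project's other stores, and the real implementation lives in `@/features/scene-browser`, re-exported through the contract adapter later in this diff.

```ts
import { create } from 'zustand';

// Hypothetical shape; the actual store is not part of this diff.
interface SceneBrowserState {
  open: boolean;
  toggleBrowser: () => void;
}

export const useSceneBrowserStore = create<SceneBrowserState>((set) => ({
  open: false,
  toggleBrowser: () => set((state) => ({ open: !state.open })),
}));
```
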
+ {/* Background AI analysis status */} + {analysisProgress && ( + } + label={ + analysisProgress.total > 1 + ? `Analyzing ${Math.min(analysisProgress.completed + 1, analysisProgress.total)} of ${analysisProgress.total} with AI` + : 'Analyzing 1 item with AI' + } + progressAriaLabel="AI analysis progress" + progressPercent={analysisPercent} + meta={( + <> + {Math.round(analysisPercent)}% + {!analysisProgress.cancelRequested ? ( + + ) : ( + Cancelling… + )} + + )} + trailing={} + fillClassName="bg-purple-500" + /> + )} + + {/* Transcript generation progress bar */} + {transcribingCount > 0 && ( + } + label={`Generating ${transcribingCount} ${transcribingCount === 1 ? 'transcript' : 'transcripts'} in background`} + progressAriaLabel="Transcript generation progress" + progressPercent={transcribingAvgProgress * 100} + meta={( + <> + {singleTranscriptionStageLabel && ( + + {singleTranscriptionStageLabel} + + )} + + {Math.round(transcribingAvgProgress * 100)}% + + + + )} + fillClassName="bg-blue-500" + /> + )} + {/* Proxy generation progress bar */} {generatingCount > 0 && ( -
-            {/* …removed markup (lost in extraction): the old inline proxy
-                progress card, i.e. spinner, "Generating {generatingCount}
-                {generatingCount === 1 ? 'proxy' : 'proxies'} in background",
-                "{Math.round(generatingAvgProgress * 100)}%", a cancel
-                control, and the width-driven fill; superseded by the
-                <BackgroundTaskProgress …/> usage continuing below… */}
+ } + label={`Generating ${generatingCount} ${generatingCount === 1 ? 'proxy' : 'proxies'} in background`} + progressAriaLabel="Proxy generation progress" + progressPercent={generatingAvgProgress * 100} + meta={( + <> + + {Math.round(generatingAvgProgress * 100)}% + + + + )} + fillClassName="bg-green-500" + /> )} {/* Delete confirmation dialog */} diff --git a/src/features/media-library/components/transcribe-dialog.test.tsx b/src/features/media-library/components/transcribe-dialog.test.tsx new file mode 100644 index 000000000..fb10e26fa --- /dev/null +++ b/src/features/media-library/components/transcribe-dialog.test.tsx @@ -0,0 +1,261 @@ +import type { ReactNode } from 'react'; +import React, { useContext } from 'react'; +import { fireEvent, render, screen } from '@testing-library/react'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +const settingsStoreState = vi.hoisted(() => ({ + defaultWhisperModel: 'whisper-base', + defaultWhisperQuantization: 'hybrid', + defaultWhisperLanguage: '', +})); + +const editorStoreState = vi.hoisted(() => ({ + clearMediaSkimPreview: vi.fn(), + clearCompoundClipSkimPreview: vi.fn(), + beginTranscriptionDialog: vi.fn(), + endTranscriptionDialog: vi.fn(), +})); + +const playbackStoreState = vi.hoisted(() => ({ + setPreviewFrame: vi.fn(), + pause: vi.fn(), +})); + +vi.mock('@/features/media-library/deps/settings-contract', () => ({ + useSettingsStore: (selector: (state: typeof settingsStoreState) => unknown) => selector(settingsStoreState), +})); + +vi.mock('@/app/state/editor', () => ({ + useEditorStore: (selector: (state: typeof editorStoreState) => unknown) => selector(editorStoreState), +})); + +vi.mock('@/shared/state/playback', () => ({ + usePlaybackStore: { + getState: () => playbackStoreState, + }, +})); + +vi.mock('../transcription/registry', () => ({ + getMediaTranscriptionModelOptions: () => [ + { value: 'whisper-base', label: 'Whisper Base' }, + ], +})); + +vi.mock('@/shared/utils/whisper-settings', () => ({ + getWhisperLanguageSelectValue: (value: string) => value, + getWhisperLanguageSettingValue: (value: string) => value, + normalizeSelectableWhisperModel: (value: string) => value, + WHISPER_LANGUAGE_OPTIONS: [ + { value: '', label: 'Auto-detect' }, + ], + WHISPER_QUANTIZATION_OPTIONS: [ + { value: 'hybrid', label: 'Hybrid' }, + ], +})); + +vi.mock('@/components/ui/button', () => ({ + Button: ({ + children, + onClick, + disabled, + }: { + children: ReactNode; + onClick?: () => void; + disabled?: boolean; + }) => ( + + ), +})); + +vi.mock('@/components/ui/label', () => ({ + Label: ({ children }: { children: ReactNode }) => , +})); + +vi.mock('@/components/ui/combobox', () => ({ + Combobox: ({ value, onValueChange, disabled }: { + value: string; + onValueChange: (value: string) => void; + disabled?: boolean; + }) => ( + onValueChange(event.target.value)} + /> + ), +})); + +vi.mock('@/components/ui/select', () => ({ + Select: ({ + children, + value, + onValueChange, + disabled, + }: { + children: ReactNode; + value: string; + onValueChange: (value: string) => void; + disabled?: boolean; + }) => ( + + ), + SelectTrigger: ({ children }: { children: ReactNode }) => <>{children}, + SelectValue: () => null, + SelectContent: ({ children }: { children: ReactNode }) => <>{children}, + SelectItem: ({ children, value }: { children: ReactNode; value: string }) => ( + + ), +})); + +vi.mock('lucide-react', () => ({ + Loader2: () => , + Square: () => , +})); + +vi.mock('@/components/ui/dialog', async () => { + const ReactModule = await 
import('react'); + const DialogContext = ReactModule.createContext<{ + open: boolean; + onOpenChange: (open: boolean) => void; + }>({ + open: false, + onOpenChange: () => {}, + }); + + return { + Dialog: ({ + open, + onOpenChange, + children, + }: { + open: boolean; + onOpenChange: (open: boolean) => void; + children: ReactNode; + }) => ( + + {open ? ( +
+ + {children} +
+ ) : null} +
+ ), + DialogContent: ({ + children, + hideCloseButton, + }: { + children: ReactNode; + hideCloseButton?: boolean; + }) => { + const { open, onOpenChange } = useContext(DialogContext); + if (!open) return null; + return ( +
+ {!hideCloseButton && ( + + )} + {children} +
+ ); + }, + DialogHeader: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogTitle: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogDescription: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>,
+    DialogFooter: ({ children }: { children: ReactNode }) =>
+      <div>{children}</div>
, + }; +}); + +import { TranscribeDialog } from './transcribe-dialog'; + +describe('TranscribeDialog', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it('clears background skim and scrub previews when opened', () => { + render( + + ); + + expect(editorStoreState.clearMediaSkimPreview).toHaveBeenCalledTimes(1); + expect(editorStoreState.clearCompoundClipSkimPreview).toHaveBeenCalledTimes(1); + expect(editorStoreState.beginTranscriptionDialog).toHaveBeenCalledTimes(1); + expect(playbackStoreState.setPreviewFrame).toHaveBeenCalledWith(null); + expect(playbackStoreState.pause).toHaveBeenCalledTimes(1); + }); + + it('requires stopping before the dialog can close mid-transcription', () => { + const onOpenChange = vi.fn(); + const onCancel = vi.fn(); + + render( + + ); + + expect(screen.queryByRole('button', { name: 'Close' })).not.toBeInTheDocument(); + expect(screen.queryByText('Close')).not.toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Stop' })).toBeInTheDocument(); + + fireEvent.click(screen.getByTestId('dialog-dismiss')); + expect(onOpenChange).not.toHaveBeenCalled(); + + fireEvent.click(screen.getByRole('button', { name: 'Stop' })); + expect(onCancel).toHaveBeenCalledTimes(1); + }); + + it('allows closing again once transcription is idle', () => { + const onOpenChange = vi.fn(); + + render( + + ); + + fireEvent.click(screen.getByRole('button', { name: 'Close' })); + expect(onOpenChange).toHaveBeenCalledWith(false); + }); +}); diff --git a/src/features/media-library/components/transcribe-dialog.tsx b/src/features/media-library/components/transcribe-dialog.tsx new file mode 100644 index 000000000..2a306b89b --- /dev/null +++ b/src/features/media-library/components/transcribe-dialog.tsx @@ -0,0 +1,255 @@ +import { useCallback, useEffect, useMemo, useState } from 'react'; +import { Loader2, Square } from 'lucide-react'; +import { + Dialog, + DialogContent, + DialogDescription, + DialogFooter, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { Button } from '@/components/ui/button'; +import { Label } from '@/components/ui/label'; +import { Combobox } from '@/components/ui/combobox'; +import { useEditorStore } from '@/app/state/editor'; +import { usePlaybackStore } from '@/shared/state/playback'; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from '@/components/ui/select'; +import { useSettingsStore } from '@/features/media-library/deps/settings-contract'; +import { + getMediaTranscriptionModelOptions, +} from '../transcription/registry'; +import { + getWhisperLanguageSelectValue, + getWhisperLanguageSettingValue, + normalizeSelectableWhisperModel, + WHISPER_LANGUAGE_OPTIONS, + WHISPER_QUANTIZATION_OPTIONS, +} from '@/shared/utils/whisper-settings'; +import type { + MediaTranscriptModel, + MediaTranscriptQuantization, +} from '@/types/storage'; + +export interface TranscribeDialogValues { + model: MediaTranscriptModel; + quantization: MediaTranscriptQuantization; + language: string; +} + +interface TranscribeDialogProps { + open: boolean; + onOpenChange: (open: boolean) => void; + fileName: string; + hasTranscript: boolean; + isRunning: boolean; + progressPercent: number | null; + progressLabel: string; + errorMessage?: string | null; + onStart: (values: TranscribeDialogValues) => void; + onCancel: () => void; +} + +export function TranscribeDialog({ + open, + onOpenChange, + fileName, + hasTranscript, + isRunning, + progressPercent, + progressLabel, + errorMessage, + onStart, + onCancel, +}: 
TranscribeDialogProps) { + const defaultModel = useSettingsStore((s) => s.defaultWhisperModel); + const defaultQuantization = useSettingsStore((s) => s.defaultWhisperQuantization); + const defaultLanguage = useSettingsStore((s) => s.defaultWhisperLanguage); + const clearMediaSkimPreview = useEditorStore((s) => s.clearMediaSkimPreview); + const clearCompoundClipSkimPreview = useEditorStore((s) => s.clearCompoundClipSkimPreview); + const beginTranscriptionDialog = useEditorStore((s) => s.beginTranscriptionDialog); + const endTranscriptionDialog = useEditorStore((s) => s.endTranscriptionDialog); + + const modelOptions = useMemo(() => getMediaTranscriptionModelOptions(), []); + + const [model, setModel] = useState(() => + normalizeSelectableWhisperModel(defaultModel), + ); + const [quantization, setQuantization] = useState(defaultQuantization); + const [languageValue, setLanguageValue] = useState(() => + getWhisperLanguageSelectValue(defaultLanguage), + ); + + useEffect(() => { + if (!open) return; + setModel(normalizeSelectableWhisperModel(defaultModel)); + setQuantization(defaultQuantization); + setLanguageValue(getWhisperLanguageSelectValue(defaultLanguage)); + }, [open, defaultLanguage, defaultModel, defaultQuantization]); + + useEffect(() => { + if (!open) return; + beginTranscriptionDialog(); + clearMediaSkimPreview(); + clearCompoundClipSkimPreview(); + usePlaybackStore.getState().setPreviewFrame(null); + usePlaybackStore.getState().pause(); + + return () => { + endTranscriptionDialog(); + }; + }, [ + beginTranscriptionDialog, + clearCompoundClipSkimPreview, + clearMediaSkimPreview, + endTranscriptionDialog, + open, + ]); + + const handleStart = () => { + onStart({ + model, + quantization, + language: getWhisperLanguageSettingValue(languageValue), + }); + }; + + const handleOpenChange = useCallback((nextOpen: boolean) => { + if (isRunning && !nextOpen) { + return; + } + onOpenChange(nextOpen); + }, [isRunning, onOpenChange]); + + const title = hasTranscript ? 'Refresh Transcript' : 'Generate Transcript'; + + return ( + + event.preventDefault()} + onInteractOutside={(event) => event.preventDefault()} + onEscapeKeyDown={(event) => { + if (isRunning) event.preventDefault(); + }} + > + + {title} + {fileName} + + +
+          {/* …form markup lost in extraction: three labeled controls, for
+              model (Combobox over modelOptions), quantization (Select over
+              WHISPER_QUANTIZATION_OPTIONS), and language (Select over
+              WHISPER_LANGUAGE_OPTIONS), each disabled while isRunning… */}
+ + +
+ + {errorMessage && !isRunning && ( +
+ {errorMessage} +
+ )} + + {isRunning && ( +
+            {/* …row markup lost in extraction: progress spinner… */}
+            {progressLabel}
+            {progressPercent !== null && (
+              {/* …bar markup lost in extraction: fill width driven by
+                  progressPercent… */}
+            )}
+          )}
+        <DialogFooter>
+          {/* …footer controls lost in extraction: while isRunning, a single
+              "Stop" button invoking onCancel; otherwise a "Close" button
+              calling onOpenChange(false) and a start button invoking
+              handleStart… */}
+        </DialogFooter>
+ ); +} diff --git a/src/features/media-library/contracts/timeline.ts b/src/features/media-library/contracts/timeline.ts index b143f9fff..1c605e6dd 100644 --- a/src/features/media-library/contracts/timeline.ts +++ b/src/features/media-library/contracts/timeline.ts @@ -10,6 +10,10 @@ export { getMediaTranscriptionModelLabel, getMediaTranscriptionModelOptions, } from '../transcription/registry'; +export { + TranscribeDialog, + type TranscribeDialogValues, +} from '../components/transcribe-dialog'; export { opfsService } from '../services/opfs-service'; export { resolveMediaUrl, diff --git a/src/features/media-library/deps/analysis-contract.ts b/src/features/media-library/deps/analysis-contract.ts new file mode 100644 index 000000000..9291c6d1f --- /dev/null +++ b/src/features/media-library/deps/analysis-contract.ts @@ -0,0 +1,28 @@ +/** + * Cross-feature contract — analysis infrastructure used by media-library. + * + * Split out of `analysis.ts` so additional analysis imports (embeddings, + * future providers) stay in one auditable place for the boundary checker. + */ + +export { captionVideo, captionImage } from '@/infrastructure/analysis'; +export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/infrastructure/analysis'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, + extractDominantColorPhrase, +} from '@/infrastructure/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, +} from '@/infrastructure/analysis'; diff --git a/src/features/media-library/deps/analysis.ts b/src/features/media-library/deps/analysis.ts index c68c392c2..6195064ad 100644 --- a/src/features/media-library/deps/analysis.ts +++ b/src/features/media-library/deps/analysis.ts @@ -1,2 +1 @@ -export { captionVideo, captionImage } from '@/infrastructure/analysis'; -export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/infrastructure/analysis'; +export * from './analysis-contract'; diff --git a/src/features/media-library/deps/composition-runtime-contract.ts b/src/features/media-library/deps/composition-runtime-contract.ts index 574b11249..5a8c7362c 100644 --- a/src/features/media-library/deps/composition-runtime-contract.ts +++ b/src/features/media-library/deps/composition-runtime-contract.ts @@ -8,4 +8,7 @@ export { startPreviewAudioConform, startPreviewAudioStartupWarm, } from '@/features/composition-runtime/utils/audio-decode-cache'; -export { deletePreviewAudioConform } from '@/features/composition-runtime/utils/preview-audio-conform'; +export { + deletePreviewAudioConform, + resolvePreviewAudioConformUrl, +} from '@/features/composition-runtime/utils/preview-audio-conform'; diff --git a/src/features/media-library/deps/scene-browser-contract.ts b/src/features/media-library/deps/scene-browser-contract.ts new file mode 100644 index 000000000..54078cba9 --- /dev/null +++ b/src/features/media-library/deps/scene-browser-contract.ts @@ -0,0 +1,10 @@ +/** + * Adapter — media-library mounts the Scene Browser panel and opens it from + * the info popover through this contract. 
+ */ + +export { + SceneBrowserPanel, + useSceneBrowserStore, + invalidateMediaCaptionThumbnails, +} from '@/features/scene-browser'; diff --git a/src/features/media-library/deps/scene-browser.ts b/src/features/media-library/deps/scene-browser.ts new file mode 100644 index 000000000..bf30c3858 --- /dev/null +++ b/src/features/media-library/deps/scene-browser.ts @@ -0,0 +1 @@ +export * from './scene-browser-contract'; diff --git a/src/features/media-library/deps/settings-contract.ts b/src/features/media-library/deps/settings-contract.ts index c704a2be5..aa61d8958 100644 --- a/src/features/media-library/deps/settings-contract.ts +++ b/src/features/media-library/deps/settings-contract.ts @@ -2,4 +2,9 @@ * Adapter — re-exports settings store for media-library consumption. */ -export { useSettingsStore } from '@/features/settings/stores/settings-store'; +export { + useSettingsStore, + resolveCaptioningIntervalSec, + DEFAULT_CAPTIONING_INTERVAL_SECONDS, +} from '@/features/settings/stores/settings-store'; +export type { CaptioningIntervalUnit } from '@/features/settings/stores/settings-store'; diff --git a/src/features/media-library/services/media-analysis-service.test.ts b/src/features/media-library/services/media-analysis-service.test.ts new file mode 100644 index 000000000..eda66eb79 --- /dev/null +++ b/src/features/media-library/services/media-analysis-service.test.ts @@ -0,0 +1,167 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import type { MediaMetadata } from '@/types/storage'; + +const captionImageMock = vi.fn(); +const captionVideoMock = vi.fn(); +const resolveCaptioningIntervalSecMock = vi.fn(() => 3); +const saveCaptionThumbnailMock = vi.fn(); +const deleteCaptionThumbnailsMock = vi.fn(); +const deleteCaptionEmbeddingsMock = vi.fn(); +const updateMediaCaptionsMock = vi.fn(); +const getMediaBlobUrlMock = vi.fn(); +const invalidateMediaCaptionThumbnailsMock = vi.fn(); +const storeGetStateMock = vi.fn(); + +let storeState: ReturnType; + +vi.mock('../deps/analysis', () => ({ + captionImage: captionImageMock, + captionVideo: captionVideoMock, + EMBEDDING_MODEL_ID: 'embed-model', + EMBEDDING_MODEL_DIM: 384, + CLIP_MODEL_ID: 'clip-model', + CLIP_EMBEDDING_DIM: 512, + embeddingsProvider: { + ensureReady: vi.fn(), + embedBatch: vi.fn(), + }, + clipProvider: { + ensureReady: vi.fn(), + embedImages: vi.fn(), + }, + buildEmbeddingText: vi.fn(() => 'caption text'), + extractDominantColors: vi.fn(), +})); + +vi.mock('../deps/settings-contract', () => ({ + useSettingsStore: { + getState: () => ({ + captioningIntervalUnit: 'seconds', + captioningIntervalValue: 3, + }), + }, + resolveCaptioningIntervalSec: resolveCaptioningIntervalSecMock, +})); + +vi.mock('@/infrastructure/storage', () => ({ + saveCaptionThumbnail: saveCaptionThumbnailMock, + deleteCaptionThumbnails: deleteCaptionThumbnailsMock, + deleteCaptionEmbeddings: deleteCaptionEmbeddingsMock, + saveCaptionEmbeddings: vi.fn(), + saveCaptionImageEmbeddings: vi.fn(), + getTranscript: vi.fn(), +})); + +vi.mock('../deps/scene-browser', () => ({ + invalidateMediaCaptionThumbnails: invalidateMediaCaptionThumbnailsMock, +})); + +vi.mock('../stores/media-library-store', () => ({ + useMediaLibraryStore: { + getState: storeGetStateMock, + }, +})); + +vi.mock('./media-library-service', () => ({ + mediaLibraryService: { + getMediaBlobUrl: getMediaBlobUrlMock, + updateMediaCaptions: updateMediaCaptionsMock, + }, +})); + +vi.mock('../utils/validation', () => ({ + getMediaType: (mimeType: string) => { + if 
(mimeType.startsWith('image/')) return 'image'; + if (mimeType.startsWith('video/')) return 'video'; + return 'unknown'; + }, +})); + +const { mediaAnalysisService } = await import('./media-analysis-service'); + +function createStoreState() { + return { + analysisProgress: null as { total: number; completed: number; cancelRequested: boolean } | null, + beginAnalysisRun: vi.fn((count: number) => { + storeState.analysisProgress = { total: count, completed: 0, cancelRequested: false }; + }), + incrementAnalysisCompleted: vi.fn(), + endAnalysisRun: vi.fn(() => { + storeState.analysisProgress = null; + }), + setTaggingMedia: vi.fn(), + updateMediaCaptions: vi.fn(), + showNotification: vi.fn(), + }; +} + +function makeMedia(overrides: Partial = {}): MediaMetadata { + return { + id: 'media-1', + fileName: 'frame.png', + storageType: 'opfs', + fileSize: 1024, + mimeType: 'image/png', + duration: 0, + width: 1920, + height: 1080, + fps: 30, + bitrate: 0, + codec: 'png', + thumbnailId: 'thumb-1', + tags: [], + createdAt: 1, + updatedAt: 1, + ...overrides, + }; +} + +describe('mediaAnalysisService.analyzeMedia', () => { + beforeEach(() => { + vi.clearAllMocks(); + storeState = createStoreState(); + storeGetStateMock.mockImplementation(() => storeState); + getMediaBlobUrlMock.mockResolvedValue('blob:media-1'); + updateMediaCaptionsMock.mockImplementation(async (_mediaId: string, captions: unknown) => ({ + ...makeMedia(), + aiCaptions: captions as MediaMetadata['aiCaptions'], + })); + captionVideoMock.mockReset(); + captionImageMock.mockReset(); + resolveCaptioningIntervalSecMock.mockReturnValue(3); + vi.stubGlobal('fetch', vi.fn(async () => new Response(new Blob(['image-bytes'], { type: 'image/png' })))); + Object.defineProperty(URL, 'revokeObjectURL', { + configurable: true, + value: vi.fn(), + }); + }); + + it('keeps existing scene assets intact when re-analysis fails', async () => { + const media = makeMedia({ + aiCaptions: [{ timeSec: 0, text: 'Existing caption', thumbRelPath: 'media/media-1/cache/ai/captions-thumbs/0.jpg' }], + }); + captionImageMock.mockRejectedValue(new Error('caption failed')); + + await expect(mediaAnalysisService.analyzeMedia(media)).resolves.toBe(false); + + expect(deleteCaptionThumbnailsMock).not.toHaveBeenCalled(); + expect(deleteCaptionEmbeddingsMock).not.toHaveBeenCalled(); + expect(updateMediaCaptionsMock).not.toHaveBeenCalled(); + }); + + it('clears caption metadata and old assets when a rerun finds no scenes', async () => { + const media = makeMedia({ + aiCaptions: [{ timeSec: 0, text: 'Existing caption', thumbRelPath: 'media/media-1/cache/ai/captions-thumbs/0.jpg' }], + }); + captionImageMock.mockResolvedValue([]); + + await expect(mediaAnalysisService.analyzeMedia(media)).resolves.toBe(true); + + expect(updateMediaCaptionsMock).toHaveBeenCalledWith(media.id, [], { + sampleIntervalSec: 3, + }); + expect(storeState.updateMediaCaptions).toHaveBeenCalledWith(media.id, []); + expect(deleteCaptionThumbnailsMock).toHaveBeenCalledWith(media.id); + expect(deleteCaptionEmbeddingsMock).toHaveBeenCalledWith(media.id); + }); +}); diff --git a/src/features/media-library/services/media-analysis-service.ts b/src/features/media-library/services/media-analysis-service.ts new file mode 100644 index 000000000..1963c9f7a --- /dev/null +++ b/src/features/media-library/services/media-analysis-service.ts @@ -0,0 +1,385 @@ +/** + * Runs the "Analyze with AI" pipeline for a single media item — captions, + * dominant-color palette, text embeddings, and CLIP image embeddings — so + * both the 
media card's per-item menu and the scene browser's "analyze all" + * action hit the exact same path. + * + * Extracted from `media-card.tsx` so there's one authoritative flow for + * wiping stale thumbs/embeddings, running the captioner, indexing, and + * persisting to the workspace. The call site does nothing but drive UI. + */ + +import type { MediaMetadata } from '@/types/storage'; +import { + captionVideo, + captionImage, + type MediaCaption, + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, +} from '../deps/analysis'; +import { + useSettingsStore, + resolveCaptioningIntervalSec, +} from '../deps/settings-contract'; +import { + saveCaptionThumbnail, + deleteCaptionThumbnails, + deleteCaptionEmbeddings, + saveCaptionEmbeddings, + saveCaptionImageEmbeddings, + getTranscript, +} from '@/infrastructure/storage'; +import { invalidateMediaCaptionThumbnails } from '../deps/scene-browser'; +import { useMediaLibraryStore } from '../stores/media-library-store'; +import { mediaLibraryService } from './media-library-service'; +import { getMediaType } from '../utils/validation'; +import { createLogger } from '@/shared/logging/logger'; + +const logger = createLogger('MediaAnalysisService'); + +export interface AnalyzeBatchResult { + analyzed: number; + skipped: number; + failed: number; +} + +export interface AnalyzeBatchOptions { + /** When true, only analyze media that has no captions yet. Default: false (re-analyze everything). */ + onlyMissing?: boolean; + /** Optional filter for which media to consider (e.g. a single scope id). */ + mediaIds?: readonly string[]; +} + +class MediaAnalysisService { + private batchInFlight = false; + + /** + * Analyze a single media item end-to-end. Accepts either a mediaId (resolved + * from the library store) or the full `MediaMetadata` when the caller + * already has it. Returns true on success, false on failure — notifications + * are surfaced via the media-library store either way. + * + * When called standalone (not from `analyzeBatch`), wraps itself as a + * 1-item run so the background progress bar shows a concrete 0→100% + * instead of a pulsing indeterminate bar. + */ + async analyzeMedia(mediaOrId: string | MediaMetadata): Promise { + const store = useMediaLibraryStore.getState(); + const media = typeof mediaOrId === 'string' + ? store.mediaItems.find((m) => m.id === mediaOrId) + : mediaOrId; + if (!media) return false; + + const ownsRun = !this.batchInFlight && !store.analysisProgress; + if (ownsRun) { + store.beginAnalysisRun(1); + } + try { + const ok = await this.analyzeOne(media); + if (ownsRun) { + useMediaLibraryStore.getState().incrementAnalysisCompleted(1); + } + return ok; + } finally { + if (ownsRun) { + useMediaLibraryStore.getState().endAnalysisRun(); + } + } + } + + private async analyzeOne(media: MediaMetadata): Promise { + const store = useMediaLibraryStore.getState(); + const mediaType = getMediaType(media.mimeType); + if (mediaType !== 'video' && mediaType !== 'image') return false; + + const { captioningIntervalUnit, captioningIntervalValue } = useSettingsStore.getState(); + const sampleIntervalSec = resolveCaptioningIntervalSec( + captioningIntervalUnit, + captioningIntervalValue, + media.fps, + ); + + store.setTaggingMedia(media.id, true); + + try { + // Drop every in-memory thumbnail URL and semantic cache entry tied to + // this media before re-analysis starts. 
If the rerun fails, the old + // on-disk assets still exist and can be rehydrated on demand; if it + // succeeds, fresh thumbs/embeddings repopulate the caches below. + invalidateMediaCaptionThumbnails(media.id); + + let captions: MediaCaption[]; + const stagedThumbnailBlobs = new Map(); + + const stageThumbnail = async (index: number, blob: Blob): Promise => { + stagedThumbnailBlobs.set(index, blob); + return undefined; + }; + + if (mediaType === 'video') { + const blobUrl = await mediaLibraryService.getMediaBlobUrl(media.id); + if (!blobUrl) throw new Error('Could not load media file'); + + const video = document.createElement('video'); + video.muted = true; + video.preload = 'auto'; + video.src = blobUrl; + + await new Promise((resolve, reject) => { + video.onloadedmetadata = () => resolve(); + video.onerror = () => reject(new Error('Failed to load video')); + }); + + try { + captions = await captionVideo(video, { + sampleIntervalSec, + saveThumbnail: stageThumbnail, + }); + } finally { + video.src = ''; + URL.revokeObjectURL(blobUrl); + } + } else { + const blobUrl = await mediaLibraryService.getMediaBlobUrl(media.id); + if (!blobUrl) throw new Error('Could not load media file'); + + const response = await fetch(blobUrl); + const blob = await response.blob(); + URL.revokeObjectURL(blobUrl); + captions = await captionImage(blob, { saveThumbnail: stageThumbnail }); + } + + if (captions.length > 0) { + let embeddingModel: string | undefined; + let embeddingDim: number | undefined; + let imageEmbeddingModel: string | undefined; + let imageEmbeddingDim: number | undefined; + let captionsWithEmbeddings = captions; + + const thumbBlobs = captions.map((_, index) => + stagedThumbnailBlobs.get(index) ?? null, + ); + + const colorResults = await Promise.all( + thumbBlobs.map(async (blob) => { + if (!blob) return { phrase: '', palette: [] as const }; + try { return await extractDominantColors(blob); } + catch { return { phrase: '', palette: [] as const }; } + }), + ); + const palettesByIndex = colorResults.map((r) => r.palette); + + captionsWithEmbeddings = captions.map((caption, i) => { + const palette = palettesByIndex[i]; + const next = { ...caption } as typeof caption & { + palette?: typeof palette; + }; + if (palette && palette.length > 0) next.palette = [...palette]; + return next; + }); + + try { + await embeddingsProvider.ensureReady(); + + const transcript = await getTranscript(media.id).catch(() => null); + + const texts = captions.map((caption, i) => buildEmbeddingText({ + caption: { text: caption.text, timeSec: caption.timeSec }, + sceneData: caption.sceneData, + transcriptSegments: transcript?.segments, + colorPhrase: colorResults[i]?.phrase ?? 
'', + })); + + const vectors = await embeddingsProvider.embedBatch(texts); + if (vectors.length === captions.length) { + await saveCaptionEmbeddings(media.id, vectors, EMBEDDING_MODEL_DIM); + embeddingModel = EMBEDDING_MODEL_ID; + embeddingDim = EMBEDDING_MODEL_DIM; + captionsWithEmbeddings = captionsWithEmbeddings.map((caption, i) => ({ + ...caption, + embedding: Array.from(vectors[i]!), + })); + } + } catch (error) { + store.showNotification({ + type: 'warning', + message: `Semantic indexing skipped for "${media.fileName}" — keyword search still works.`, + }); + void error; + } + + try { + const validBlobs = thumbBlobs.filter((b): b is Blob => b !== null); + if (validBlobs.length > 0 && validBlobs.length === captions.length) { + await clipProvider.ensureReady(); + const imageVectors = await clipProvider.embedImages(validBlobs); + if (imageVectors.length === captions.length) { + await saveCaptionImageEmbeddings(media.id, imageVectors, CLIP_EMBEDDING_DIM); + imageEmbeddingModel = CLIP_MODEL_ID; + imageEmbeddingDim = CLIP_EMBEDDING_DIM; + } + } + } catch (error) { + void error; + } + + if (stagedThumbnailBlobs.size > 0) { + captionsWithEmbeddings = await Promise.all( + captionsWithEmbeddings.map(async (caption, index) => { + const blob = stagedThumbnailBlobs.get(index); + if (!blob) return caption; + try { + const thumbRelPath = await saveCaptionThumbnail(media.id, index, blob); + return { ...caption, thumbRelPath }; + } catch { + return caption; + } + }), + ); + } + + await mediaLibraryService.updateMediaCaptions(media.id, captionsWithEmbeddings, { + sampleIntervalSec, + embeddingModel, + embeddingDim, + imageEmbeddingModel, + imageEmbeddingDim, + }); + store.updateMediaCaptions(media.id, captionsWithEmbeddings); + + const sceneCaptionCountLabel = `${captions.length} scene caption${captions.length === 1 ? '' : 's'}`; + store.showNotification({ + type: 'success', + message: `Generated ${sceneCaptionCountLabel} for "${media.fileName}"`, + }); + } else { + await mediaLibraryService.updateMediaCaptions(media.id, [], { + sampleIntervalSec, + }); + store.updateMediaCaptions(media.id, []); + await deleteCaptionThumbnails(media.id); + await deleteCaptionEmbeddings(media.id); + store.showNotification({ + type: 'info', + message: `No scene captions generated for "${media.fileName}"`, + }); + } + return true; + } catch (error) { + store.showNotification({ + type: 'error', + message: error instanceof Error ? error.message : 'Failed to analyze media', + }); + return false; + } finally { + store.setTaggingMedia(media.id, false); + } + } + + /** + * Analyze a batch of media sequentially. Sequential avoids thrashing the + * shared WebGPU device and CLIP model — parallelism here would starve the + * preview canvas and risk OOM on longer videos. + * + * A single batch is the unit of concurrency — calling twice while one is + * running is a no-op (second call resolves immediately with zeros). The + * per-item tagging flag blocks any overlapping per-card "Analyze" clicks. + */ + async analyzeBatch(options: AnalyzeBatchOptions = {}): Promise { + if (this.batchInFlight) { + return { analyzed: 0, skipped: 0, failed: 0 }; + } + this.batchInFlight = true; + + const store = useMediaLibraryStore.getState(); + const all = store.mediaItems; + const pool = options.mediaIds + ? 
all.filter((m) => options.mediaIds!.includes(m.id)) + : all; + + const targets = pool.filter((m) => { + const type = getMediaType(m.mimeType); + if (type !== 'video' && type !== 'image') return false; + if (options.onlyMissing && (m.aiCaptions?.length ?? 0) > 0) return false; + return true; + }); + + let analyzed = 0; + let failed = 0; + let cancelled = 0; + const skipped = pool.length - targets.length; + + try { + if (targets.length === 0) { + store.showNotification({ + type: 'info', + message: options.onlyMissing + ? 'No unanalyzed media to process.' + : 'No media to analyze.', + }); + return { analyzed: 0, skipped, failed: 0 }; + } + + store.beginAnalysisRun(targets.length); + store.showNotification({ + type: 'info', + message: targets.length === 1 + ? `Analyzing "${firstName(targets)}"…` + : `Analyzing ${targets.length} media files…`, + }); + + for (const media of targets) { + // Cancel is cooperative — the in-flight item finishes first. Any + // remaining items are skipped but still counted toward `completed` + // so the progress bar reaches 100% and unmounts cleanly instead + // of stranding the user with a stuck bar. + const { analysisProgress } = useMediaLibraryStore.getState(); + if (analysisProgress?.cancelRequested) { + cancelled = targets.length - (analyzed + failed); + useMediaLibraryStore.getState().incrementAnalysisCompleted(cancelled); + break; + } + logger.info('batch analyzing media', { mediaId: media.id, fileName: media.fileName }); + const ok = await this.analyzeOne(media); + if (ok) analyzed += 1; + else failed += 1; + useMediaLibraryStore.getState().incrementAnalysisCompleted(1); + } + + if (targets.length > 1) { + const suffix = failed > 0 ? ` — ${failed} failed` : ''; + const cancelSuffix = cancelled > 0 ? ` (${cancelled} cancelled)` : ''; + store.showNotification({ + type: cancelled > 0 ? 'warning' : (failed === 0 ? 'success' : 'warning'), + message: `Analyzed ${analyzed}/${targets.length}${suffix}${cancelSuffix}`, + }); + } + } finally { + useMediaLibraryStore.getState().endAnalysisRun(); + this.batchInFlight = false; + } + + return { analyzed, skipped, failed }; + } + + /** Ask the currently running analysis to stop after the in-flight item. */ + requestCancel(): void { + useMediaLibraryStore.getState().requestAnalysisCancel(); + } + + isBatchInFlight(): boolean { + return this.batchInFlight; + } +} + +function firstName(items: readonly MediaMetadata[]): string { + return items[0]?.fileName ?? ''; +} + +export const mediaAnalysisService = new MediaAnalysisService(); diff --git a/src/features/media-library/services/media-captioning-service.ts b/src/features/media-library/services/media-captioning-service.ts new file mode 100644 index 000000000..addd0f487 --- /dev/null +++ b/src/features/media-library/services/media-captioning-service.ts @@ -0,0 +1,273 @@ +/** + * Bridges AI captions (vision-language-model frame descriptions) into timeline + * text items. Mirrors {@link MediaTranscriptionService.insertTranscriptAsCaptions} + * but sources from `MediaCaption[]` (point-in-time descriptions) rather than + * whisper speech-to-text segments. + * + * Keep both services aligned in behavior — if one gains new track-placement + * or replacement logic, the other usually needs the same treatment. 
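+ *
+ * The shape difference, sketched with hypothetical values:
+ *
+ *   // whisper segment: spans the speech it transcribes
+ *   { start: 1.2, end: 3.4, text: 'hello there' }
+ *
+ *   // AI caption: a single sampled instant; duration is derived from the
+ *   // captioning sample interval when it is converted to a segment
+ *   { timeSec: 30, text: 'a person walks through a park' }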
+ */ + +import { useSelectionStore } from '@/shared/state/selection'; +import { createLogger } from '@/shared/logging/logger'; +import type { MediaCaption } from '@/infrastructure/analysis'; +import type { + AudioItem, + TextItem, + TimelineItem, + TimelineTrack, + VideoItem, +} from '@/types/timeline'; +import { + aiCaptionsToSegments, + buildCaptionTextItems, + buildCaptionTrackAbove, + findReplaceableCaptionItemsForClip, + findCompatibleCaptionTrackForRanges, + isCaptionTrackCandidate, + getCaptionTextItemTemplate, + getCaptionRangeForClip, +} from '../utils/caption-items'; +import { useProjectStore } from '@/features/media-library/deps/projects'; +import { useTimelineStore } from '@/features/media-library/deps/timeline-stores'; + +const logger = createLogger('MediaCaptioningService'); + +type CaptionableClip = AudioItem | VideoItem; + +interface InsertAiCaptionsOptions { + /** Restrict insertion to these clip ids. Defaults to selection/playhead heuristics. */ + clipIds?: readonly string[]; + /** If true, pre-existing AI-caption items on matched clips are removed first. */ + replaceExisting?: boolean; + /** Sample interval reported by the captioning provider — used to size trailing caption duration. */ + sampleIntervalSec?: number; +} + +export interface InsertAiCaptionsResult { + insertedItemCount: number; + removedItemCount: number; + /** `true` when no compatible clip was found on the timeline. */ + noTargetClips: boolean; +} + +class MediaCaptioningService { + /** + * Insert AI captions as timeline text items anchored to the clips that use + * `mediaId`. Finds a compatible existing caption track per clip, or creates + * one. Returns `noTargetClips: true` when the media isn't on the timeline + * yet — callers should treat that as a soft outcome, not an error. + */ + async insertAiCaptionsOnTimeline( + mediaId: string, + captions: readonly MediaCaption[], + options: InsertAiCaptionsOptions = {}, + ): Promise { + logger.info('insertAiCaptionsOnTimeline invoked', { + mediaId, + captionCount: captions.length, + options, + }); + + if (captions.length === 0) { + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: false }; + } + + const segments = aiCaptionsToSegments(captions, options.sampleIntervalSec); + logger.info('aiCaptionsToSegments produced segments', { + mediaId, + segmentCount: segments.length, + firstSegment: segments[0], + lastSegment: segments.at(-1), + }); + if (segments.length === 0) { + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: false }; + } + + const timeline = useTimelineStore.getState(); + const project = useProjectStore.getState().currentProject; + const targetClips = this.resolveTargetClips(mediaId, options.clipIds); + logger.info('resolveTargetClips result', { + mediaId, + targetClipCount: targetClips.length, + targetClipIds: targetClips.map((c) => c.id), + allClipsWithMediaId: timeline.items.filter((i) => 'mediaId' in i && i.mediaId === mediaId).length, + }); + if (targetClips.length === 0) { + logger.info(`No timeline clips for media ${mediaId} — captions saved but not inserted`); + return { insertedItemCount: 0, removedItemCount: 0, noTargetClips: true }; + } + + const canvasWidth = project?.metadata.width ?? 1920; + const canvasHeight = project?.metadata.height ?? 1080; + const newTracks: TimelineTrack[] = [...timeline.tracks]; + const generatedCaptionIdsToRemove = options.replaceExisting + ? 
new Set( + targetClips.flatMap((clip) => + findReplaceableCaptionItemsForClip(timeline.items, clip, 'ai-captions').map((item) => item.id), + ), + ) + : new Set(); + const plannedItems = timeline.items.filter((item) => !generatedCaptionIdsToRemove.has(item.id)); + const insertedItems: TextItem[] = []; + + for (const clip of targetClips) { + const clipRange = getCaptionRangeForClip(clip, segments, timeline.fps); + logger.info('per-clip getCaptionRangeForClip result', { + clipId: clip.id, + clipFrom: clip.from, + clipDurationInFrames: clip.durationInFrames, + sourceStart: clip.sourceStart, + sourceEnd: clip.sourceEnd, + sourceFps: clip.sourceFps, + timelineFps: timeline.fps, + clipRange, + }); + if (!clipRange) { + continue; + } + + const existingGeneratedCaptions = options.replaceExisting + ? findReplaceableCaptionItemsForClip(timeline.items, clip, 'ai-captions') + : []; + const preferredTrackId = this.resolvePreferredTrackId( + newTracks, + plannedItems, + existingGeneratedCaptions, + clipRange, + ); + + let targetTrack = preferredTrackId + ? newTracks.find((track) => track.id === preferredTrackId) ?? null + : findCompatibleCaptionTrackForRanges( + newTracks, + plannedItems, + [{ startFrame: clipRange.startFrame, endFrame: clipRange.endFrame }], + ); + + if (!targetTrack) { + // Drop the caption track directly above the clip's own track — that's + // where users expect overlaid subtitles. `buildCaptionTrackAbove` + // picks a fractional order between the clip track and the next track + // up so no existing tracks need to shift. + const clipTrack = newTracks.find((track) => track.id === clip.trackId); + targetTrack = clipTrack + ? buildCaptionTrackAbove(newTracks, clipTrack.order) + : buildCaptionTrackAbove(newTracks, 0); + newTracks.push(targetTrack); + newTracks.sort((a, b) => a.order - b.order); + } + + const clipCaptionItems = buildCaptionTextItems({ + mediaId, + trackId: targetTrack.id, + segments, + clip, + timelineFps: timeline.fps, + canvasWidth, + canvasHeight, + sourceType: 'ai-captions', + styleTemplate: existingGeneratedCaptions[0] + ? 
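+          // reuse the style of the captions being replaced so regenerating
+          // can keep any prior styling tweaks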
getCaptionTextItemTemplate(existingGeneratedCaptions[0]) + : undefined, + }); + logger.info('buildCaptionTextItems produced items', { + clipId: clip.id, + trackId: targetTrack.id, + itemCount: clipCaptionItems.length, + }); + + if (clipCaptionItems.length === 0) { + continue; + } + + insertedItems.push(...clipCaptionItems); + plannedItems.push(...clipCaptionItems); + } + + logger.info('insertAiCaptionsOnTimeline finishing', { + mediaId, + insertedItemCount: insertedItems.length, + removedItemCount: generatedCaptionIdsToRemove.size, + trackChangeCount: newTracks.length - timeline.tracks.length, + }); + + const tracksChanged = newTracks.length !== timeline.tracks.length + || newTracks.some((track, index) => track.id !== timeline.tracks[index]?.id); + if (tracksChanged) { + timeline.setTracks(newTracks); + } + + if (generatedCaptionIdsToRemove.size > 0) { + timeline.removeItems([...generatedCaptionIdsToRemove]); + } + + if (insertedItems.length > 0) { + timeline.addItems(insertedItems); + useSelectionStore.getState().selectItems(insertedItems.map((item) => item.id)); + } + + return { + insertedItemCount: insertedItems.length, + removedItemCount: generatedCaptionIdsToRemove.size, + noTargetClips: false, + }; + } + + private resolveTargetClips( + mediaId: string, + clipIds?: readonly string[], + ): CaptionableClip[] { + const timeline = useTimelineStore.getState(); + const selection = useSelectionStore.getState(); + + const matchingClips = timeline.items + .filter((item): item is CaptionableClip => + (item.type === 'video' || item.type === 'audio') && item.mediaId === mediaId, + ) + .sort((a, b) => a.from - b.from); + + if (matchingClips.length === 0) return []; + + if (clipIds && clipIds.length > 0) { + const requested = new Set(clipIds); + return matchingClips.filter((clip) => requested.has(clip.id)); + } + + const selectedClips = selection.selectedItemIds + .map((id) => matchingClips.find((clip) => clip.id === id)) + .filter((clip): clip is CaptionableClip => clip !== undefined); + if (selectedClips.length > 0) return selectedClips; + + // Default: caption every clip that uses this media. The whisper flow + // picks a single clip when many exist (it's long-form speech), but AI + // frame captions are inherently per-frame-range — applying to all clips + // is the less surprising default here. + return matchingClips; + } + + private resolvePreferredTrackId( + tracks: readonly TimelineTrack[], + items: readonly TimelineItem[], + existingCaptions: ReadonlyArray<{ trackId: string }>, + range: { startFrame: number; endFrame: number }, + ): string | null { + const trackIds = [...new Set(existingCaptions.map((item) => item.trackId))]; + if (trackIds.length !== 1) return null; + + const preferredTrack = tracks.find((track) => track.id === trackIds[0]); + if (!preferredTrack || !isCaptionTrackCandidate(preferredTrack, items)) { + return null; + } + + const hasOverlap = items.some((item) => { + if (item.trackId !== preferredTrack.id) return false; + const itemEnd = item.from + item.durationInFrames; + return item.from < range.endFrame && itemEnd > range.startFrame; + }); + + return hasOverlap ? 
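+      // an overlap means the preferred track is already occupied in this
+      // range, so return null and let the caller search for or create one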
+      null : preferredTrack.id;
+  }
+}
+
+export const mediaCaptioningService = new MediaCaptioningService();
diff --git a/src/features/media-library/services/media-library-service.ts b/src/features/media-library/services/media-library-service.ts
index 390e48e64..34d18e91a 100644
--- a/src/features/media-library/services/media-library-service.ts
+++ b/src/features/media-library/services/media-library-service.ts
@@ -56,6 +56,8 @@ import {
   getMediaForProject as getMediaForProjectDB,
   deleteTranscript,
 } from '@/infrastructure/storage';
+import { saveCaptions, deleteCaptions } from '@/infrastructure/storage/workspace-fs/captions';
+import { deleteScenes } from '@/infrastructure/storage/workspace-fs/scenes';
 import { filmstripCache, gifFrameCache, waveformCache } from '@/features/media-library/deps/timeline-services';
 import { opfsService } from './opfs-service';
 import { proxyService } from './proxy-service';
@@ -91,12 +93,13 @@ const IMPORT_BACKGROUND_WARM_DELAY_MS = 600;
 const IMPORT_BACKGROUND_HEAVY_DELAY_MS = 2200;
 
 /**
- * Media Library Service - Coordinates OPFS + IndexedDB + metadata extraction
+ * Media Library Service - Coordinates handle/OPFS media access with
+ * workspace-backed metadata, thumbnails, and derived caches.
  *
  * Includes in-memory thumbnail URL cache to prevent flicker on re-renders.
  *
- * Provides atomic operations for media management, ensuring OPFS and IndexedDB
- * stay in sync.
+ * Provides atomic operations for media management while keeping origin-scoped
+ * sources and the workspace folder in sync.
  */
 class MediaLibraryService {
   /** In-memory cache for thumbnail blob URLs to prevent flicker on re-renders */
@@ -110,6 +113,22 @@ class MediaLibraryService {
     }
   }
 
+  private async deleteCaptionsSafely(mediaId: string): Promise<void> {
+    try {
+      await deleteCaptions(mediaId);
+    } catch (error) {
+      logger.warn('Failed to delete captions:', error);
+    }
+  }
+
+  private async deleteScenesSafely(mediaId: string): Promise<void> {
+    try {
+      await deleteScenes(mediaId);
+    } catch (error) {
+      logger.warn('Failed to delete scenes:', error);
+    }
+  }
+
   private async deleteThumbnailsSafely(mediaId: string): Promise<void> {
     this.clearThumbnailCache(mediaId);
     try {
@@ -142,7 +161,7 @@ class MediaLibraryService {
 
   /**
    * Clear waveform caches for a fully-dereferenced media item. Removes
-   * the in-memory LRU entry, the IndexedDB binned persistence, and the
+   * the in-memory LRU entry, the persisted binned waveform cache, and the
    * OPFS + workspace-folder multi-resolution mirrors.
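+   * Best-effort, like the other `*Safely` helpers here: failures should be
+   * logged and swallowed so one stale cache never blocks media deletion.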
   */
  private async clearWaveformCacheSafely(mediaId: string): Promise<void> {
@@ -217,7 +236,7 @@ class MediaLibraryService {
   }
 
   /**
-   * Get all media items from IndexedDB
+   * Get all media items from workspace storage
    */
   async getAllMedia(): Promise<MediaMetadata[]> {
     return getAllMediaDB();
   }
@@ -320,7 +339,7 @@ class MediaLibraryService {
     // Check for unsupported audio codec (included in metadata from worker)
     const codecCheck = mediaProcessorService.hasUnsupportedAudioCodec(metadata);
 
-    // Stage 6: Save metadata to IndexedDB with file handle
+    // Stage 6: Save metadata with the file handle-backed source reference
     const mediaMetadata: MediaMetadata = {
       id,
       storageType: 'handle',
@@ -656,6 +675,8 @@ class MediaLibraryService {
 
     await deleteMediaDB(mediaId);
     await this.deleteTranscriptSafely(mediaId);
+    await this.deleteCaptionsSafely(mediaId);
+    await this.deleteScenesSafely(mediaId);
     await this.deleteThumbnailsSafely(mediaId);
     await this.clearGifFrameCacheSafely(mediaId);
     await this.clearFilmstripCacheSafely(mediaId);
@@ -758,6 +779,8 @@ class MediaLibraryService {
 
     await deleteMediaDB(id);
     await this.deleteTranscriptSafely(id);
+    await this.deleteCaptionsSafely(id);
+    await this.deleteScenesSafely(id);
   }
 
   /**
@@ -1009,11 +1032,41 @@ class MediaLibraryService {
   /**
    * Update AI-generated captions for a media item.
+   *
+   * Captions live in `cache/ai/captions.json` as the authoritative source.
+   * We also mirror them onto `MediaMetadata.aiCaptions` so in-memory zustand
+   * consumers and search (`media-library-store.ts`) don't need a separate
+   * hydration pass — the mirror stays consistent because this is the only
+   * writer.
    */
   async updateMediaCaptions(
     mediaId: string,
-    captions: Array<{ timeSec: number; text: string }>,
+    captions: NonNullable<MediaMetadata['aiCaptions']>,
+    options?: {
+      service?: string;
+      model?: string;
+      sampleIntervalSec?: number;
+      embeddingModel?: string;
+      embeddingDim?: number;
+      imageEmbeddingModel?: string;
+      imageEmbeddingDim?: number;
+    },
   ): Promise<void> {
+    try {
+      await saveCaptions({
+        mediaId,
+        captions,
+        service: options?.service ?? 'lfm-captioning',
+        model: options?.model ?? 'lfm-2.5-vl',
+        sampleIntervalSec: options?.sampleIntervalSec,
+        embeddingModel: options?.embeddingModel,
+        embeddingDim: options?.embeddingDim,
+        imageEmbeddingModel: options?.imageEmbeddingModel,
+        imageEmbeddingDim: options?.imageEmbeddingDim,
+      });
+    } catch (error) {
+      logger.warn(`Failed to persist captions for ${mediaId}; metadata mirror will still update`, error);
+    }
     return updateMediaDB(mediaId, { aiCaptions: captions });
   }
 
@@ -1071,7 +1124,7 @@ class MediaLibraryService {
   }
 
   /**
-   * Validate sync between OPFS and IndexedDB
+   * Validate sync between OPFS and workspace-backed metadata
    * Returns list of issues found
    *
    * Note: Only validates OPFS-based media.
Handle-based media is validated diff --git a/src/features/media-library/services/media-transcription-service.test.ts b/src/features/media-library/services/media-transcription-service.test.ts new file mode 100644 index 000000000..79e3f674d --- /dev/null +++ b/src/features/media-library/services/media-transcription-service.test.ts @@ -0,0 +1,532 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { waitFor } from '@testing-library/react'; +import type { MediaTranscript } from '@/types/storage'; +import type { TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline'; + +const saveTranscriptMock = vi.fn(); +const getTranscriptMock = vi.fn(); +const useTimelineStoreGetStateMock = vi.fn(); +const useProjectStoreGetStateMock = vi.fn(); +const useSelectionStoreGetStateMock = vi.fn(); +const usePlaybackStoreGetStateMock = vi.fn(); +const transcribeCollectMock = vi.fn(); +const transcribeMock = vi.fn(); +const getMediaMock = vi.fn(); +const getMediaFileMock = vi.fn(); +const startPreviewAudioConformMock = vi.fn(); +const resolvePreviewAudioConformUrlMock = vi.fn(); + +vi.mock('@/infrastructure/storage', () => ({ + deleteTranscript: vi.fn(), + getTranscript: getTranscriptMock, + getTranscriptMediaIds: vi.fn(), + saveTranscript: saveTranscriptMock, +})); + +vi.mock('@/shared/state/selection', () => ({ + useSelectionStore: { + getState: useSelectionStoreGetStateMock, + }, +})); + +vi.mock('@/shared/state/playback', () => ({ + usePlaybackStore: { + getState: usePlaybackStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/projects', () => ({ + useProjectStore: { + getState: useProjectStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/timeline-stores', () => ({ + useTimelineStore: { + getState: useTimelineStoreGetStateMock, + }, +})); + +vi.mock('@/features/media-library/deps/settings-contract', () => ({ + useSettingsStore: { + getState: () => ({ + defaultWhisperModel: 'tiny', + defaultWhisperQuantization: 'q8', + defaultWhisperLanguage: 'auto', + }), + }, +})); + +vi.mock('../transcription/registry', () => ({ + getDefaultMediaTranscriptionAdapter: () => ({ + createTranscriber: () => ({ + transcribe: transcribeMock, + }), + }), + getMediaTranscriptionModelLabel: () => 'Tiny', +})); + +vi.mock('./media-library-service', () => ({ + mediaLibraryService: { + getMedia: getMediaMock, + getMediaFile: getMediaFileMock, + }, +})); + +vi.mock('@/features/media-library/deps/composition-runtime-contract', () => ({ + needsCustomAudioDecoder: vi.fn((codec?: string) => codec === 'pcm-s16be'), + startPreviewAudioConform: startPreviewAudioConformMock, + resolvePreviewAudioConformUrl: resolvePreviewAudioConformUrlMock, +})); + +const { mediaTranscriptionService } = await import('./media-transcription-service'); + +function makeTrack(id: string, order: number): TimelineTrack { + return { + id, + name: id, + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order, + items: [], + }; +} + +function makeTextItem( + id: string, + trackId: string, + from: number, + durationInFrames: number, +): TimelineItem { + return { + id, + type: 'text', + trackId, + from, + durationInFrames, + label: id, + text: id, + color: '#fff', + }; +} + +describe('mediaTranscriptionService.insertTranscriptAsCaptions', () => { + beforeEach(() => { + vi.clearAllMocks(); + useSelectionStoreGetStateMock.mockReturnValue({ + selectedItemIds: [], + selectItems: vi.fn(), + }); + usePlaybackStoreGetStateMock.mockReturnValue({ currentFrame: 0 }); + 
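+    // caption items are positioned against the project canvas, so the
+    // project-store mock has to supply metadata dimensions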
useProjectStoreGetStateMock.mockReturnValue({ + currentProject: { + metadata: { + width: 1920, + height: 1080, + }, + }, + }); + transcribeMock.mockReturnValue({ + collect: transcribeCollectMock, + }); + transcribeCollectMock.mockResolvedValue([]); + getMediaMock.mockResolvedValue(null); + getMediaFileMock.mockResolvedValue(null); + startPreviewAudioConformMock.mockResolvedValue(undefined); + resolvePreviewAudioConformUrlMock.mockResolvedValue(null); + }); + + it('creates a new captions track above the clip track when no compatible track exists', async () => { + const clip: VideoItem = { + id: 'clip-1', + type: 'video', + trackId: 'track-video', + from: 0, + durationInFrames: 90, + label: 'Clip', + mediaId: 'media-1', + src: 'blob:test', + sourceStart: 0, + sourceEnd: 90, + sourceDuration: 90, + sourceFps: 30, + speed: 1, + }; + const initialTracks = [ + makeTrack('track-top', 0), + makeTrack('track-video', 1), + makeTrack('track-bottom', 2), + ]; + const setTracks = vi.fn(); + const removeItems = vi.fn(); + const addItems = vi.fn(); + + useTimelineStoreGetStateMock.mockReturnValue({ + fps: 30, + tracks: initialTracks, + items: [ + clip, + makeTextItem('top-blocker', 'track-top', 0, 90), + makeTextItem('bottom-blocker', 'track-bottom', 0, 90), + ], + setTracks, + removeItems, + addItems, + }); + + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'tiny', + language: 'auto', + quantization: 'q8', + text: 'Hello there', + segments: [ + { text: 'Hello there', start: 0, end: 2 }, + ], + createdAt: Date.now(), + updatedAt: Date.now(), + }; + getTranscriptMock.mockResolvedValue(transcript); + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions('media-1', { + clipIds: ['clip-1'], + }); + + expect(result).toEqual({ + insertedItemCount: 1, + removedItemCount: 0, + }); + expect(setTracks).toHaveBeenCalledTimes(1); + + const updatedTracks = setTracks.mock.calls[0][0] as TimelineTrack[]; + const captionTrack = updatedTracks.find((track) => !initialTracks.some((existing) => existing.id === track.id)); + expect(captionTrack).toBeDefined(); + expect(captionTrack?.order).toBe(0.5); + + expect(addItems).toHaveBeenCalledTimes(1); + const insertedItems = addItems.mock.calls[0][0] as TimelineItem[]; + expect(insertedItems).toHaveLength(1); + expect(insertedItems[0]?.trackId).toBe(captionTrack?.id); + expect(removeItems).not.toHaveBeenCalled(); + }); + + it('does not reuse an audio track when regenerating transcript captions', async () => { + const clip: VideoItem = { + id: 'clip-1', + type: 'video', + trackId: 'track-video', + from: 0, + durationInFrames: 90, + label: 'Clip', + mediaId: 'media-1', + src: 'blob:test', + sourceStart: 0, + sourceEnd: 90, + sourceDuration: 90, + sourceFps: 30, + speed: 1, + }; + const initialTracks = [ + { ...makeTrack('track-audio', 0), name: 'A1', kind: 'audio' as const }, + { ...makeTrack('track-video', 1), name: 'V1', kind: 'video' as const }, + ]; + const legacyCaptionOnAudioTrack: TimelineItem = { + id: 'caption-old', + type: 'text', + trackId: 'track-audio', + from: 0, + durationInFrames: 30, + label: 'caption-old', + text: 'caption-old', + mediaId: 'media-1', + color: '#fff', + captionSource: { + type: 'transcript', + clipId: 'clip-1', + mediaId: 'media-1', + }, + }; + const setTracks = vi.fn(); + const removeItems = vi.fn(); + const addItems = vi.fn(); + + useTimelineStoreGetStateMock.mockReturnValue({ + fps: 30, + tracks: initialTracks, + items: [clip, legacyCaptionOnAudioTrack], + setTracks, + removeItems, + 
addItems, + }); + + const transcript: MediaTranscript = { + id: 'media-1', + mediaId: 'media-1', + model: 'tiny', + language: 'auto', + quantization: 'q8', + text: 'Hello there', + segments: [{ text: 'Hello there', start: 0, end: 2 }], + createdAt: Date.now(), + updatedAt: Date.now(), + }; + getTranscriptMock.mockResolvedValue(transcript); + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions('media-1', { + clipIds: ['clip-1'], + replaceExisting: true, + }); + + expect(result).toEqual({ + insertedItemCount: 1, + removedItemCount: 1, + }); + expect(setTracks).toHaveBeenCalledTimes(1); + + const updatedTracks = setTracks.mock.calls[0][0] as TimelineTrack[]; + const captionTrack = updatedTracks.find((track) => !initialTracks.some((existing) => existing.id === track.id)); + expect(captionTrack).toBeDefined(); + expect(captionTrack?.kind).toBe('video'); + + expect(addItems).toHaveBeenCalledTimes(1); + const insertedItems = addItems.mock.calls[0][0] as TimelineItem[]; + expect(insertedItems[0]?.trackId).toBe(captionTrack?.id); + expect(insertedItems[0]?.trackId).not.toBe('track-audio'); + expect(removeItems).toHaveBeenCalledWith(['caption-old']); + }); +}); + +describe('mediaTranscriptionService.transcribeMedia', () => { + beforeEach(() => { + vi.clearAllMocks(); + transcribeMock.mockReturnValue({ + collect: transcribeCollectMock, + }); + transcribeCollectMock.mockResolvedValue([ + { text: ' hello ', start: 0, end: 1.2 }, + ]); + startPreviewAudioConformMock.mockResolvedValue(undefined); + resolvePreviewAudioConformUrlMock.mockResolvedValue(null); + }); + + it('transcribes the original file for browser-decodable codecs', async () => { + const sourceFile = new File(['audio'], 'clip.mp3', { type: 'audio/mpeg' }); + getMediaMock.mockResolvedValue({ + id: 'media-1', + fileName: 'clip.mp3', + mimeType: 'audio/mpeg', + codec: 'mp3', + fileLastModified: 123, + }); + getMediaFileMock.mockResolvedValue(sourceFile); + + await mediaTranscriptionService.transcribeMedia('media-1'); + + expect(startPreviewAudioConformMock).not.toHaveBeenCalled(); + expect(transcribeMock).toHaveBeenCalledTimes(1); + expect(transcribeMock.mock.calls[0]?.[0]).toBe(sourceFile); + expect(saveTranscriptMock).toHaveBeenCalledTimes(1); + }); + + it('transcribes a conformed wav for custom-decoded codecs like pcm-s16be', async () => { + const sourceFile = new File(['pcm'], 'clip.aif', { type: 'audio/aiff' }); + const conformedBlob = new Blob(['wav'], { type: 'audio/wav' }); + const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + blob: async () => conformedBlob, + } as Response); + + getMediaMock.mockResolvedValue({ + id: 'media-1', + fileName: 'clip.aif', + mimeType: 'audio/aiff', + codec: 'pcm-s16be', + fileLastModified: 123, + }); + getMediaFileMock.mockResolvedValue(sourceFile); + resolvePreviewAudioConformUrlMock + .mockResolvedValueOnce(null) + .mockResolvedValueOnce('blob:conformed-audio'); + + await mediaTranscriptionService.transcribeMedia('media-1'); + + expect(startPreviewAudioConformMock).toHaveBeenCalledWith('media-1', sourceFile); + expect(resolvePreviewAudioConformUrlMock).toHaveBeenCalledWith('media-1'); + expect(transcribeMock).toHaveBeenCalledTimes(1); + + const transcribeFile = transcribeMock.mock.calls[0]?.[0] as File; + expect(transcribeFile).toBeInstanceOf(File); + expect(transcribeFile.type).toBe('audio/wav'); + + fetchMock.mockRestore(); + }); + + it('reuses a cached conformed wav without starting a new conform job', async () => { + const sourceFile = new 
File(['pcm'], 'clip.aif', { type: 'audio/aiff' });
+    const conformedBlob = new Blob(['wav'], { type: 'audio/wav' });
+    const fetchMock = vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      blob: async () => conformedBlob,
+    } as Response);
+
+    getMediaMock.mockResolvedValue({
+      id: 'media-1',
+      fileName: 'clip.aif',
+      mimeType: 'audio/aiff',
+      codec: 'pcm-s16be',
+      fileLastModified: 123,
+    });
+    getMediaFileMock.mockResolvedValue(sourceFile);
+    resolvePreviewAudioConformUrlMock.mockResolvedValue('blob:cached-conformed-audio');
+
+    await mediaTranscriptionService.transcribeMedia('media-1');
+
+    expect(startPreviewAudioConformMock).not.toHaveBeenCalled();
+    expect(resolvePreviewAudioConformUrlMock).toHaveBeenCalledWith('media-1');
+    expect(transcribeMock).toHaveBeenCalledTimes(1);
+
+    const transcribeFile = transcribeMock.mock.calls[0]?.[0] as File;
+    expect(transcribeFile).toBeInstanceOf(File);
+    expect(transcribeFile.type).toBe('audio/wav');
+
+    fetchMock.mockRestore();
+  });
+
+  it('runs only one transcription job at a time and queues later requests', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let resolveFirstCollect!: (segments: Array<{ text: string; start: number; end: number }>) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((resolve) => {
+      resolveFirstCollect = resolve;
+    }));
+    const secondCollect = vi.fn().mockResolvedValue([
+      { text: ' second ', start: 0, end: 1 },
+    ]);
+
+    transcribeMock
+      .mockReturnValueOnce({ collect: firstCollect, cancel: vi.fn() })
+      .mockReturnValueOnce({ collect: secondCollect, cancel: vi.fn() });
+
+    const firstQueueState = vi.fn();
+    const secondQueueState = vi.fn();
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1', {
+      onQueueStatusChange: firstQueueState,
+    });
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2', {
+      onQueueStatusChange: secondQueueState,
+    });
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+    expect(firstQueueState).toHaveBeenCalledWith('running');
+    expect(secondQueueState).toHaveBeenCalledWith('queued');
+
+    resolveFirstCollect([{ text: ' first ', start: 0, end: 1 }]);
+
+    await firstPromise;
+    await secondPromise;
+
+    expect(transcribeMock).toHaveBeenCalledTimes(2);
+    expect(secondQueueState).toHaveBeenCalledWith('running');
+  });
+
+  it('cancels queued transcription jobs before they start', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let resolveFirstCollect!: (segments: Array<{ text: string; start: number; end: number }>) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((resolve) => {
+      resolveFirstCollect = resolve;
+    }));
+
+    transcribeMock
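+      // only the first stream is stubbed on purpose: media-2 must be
+      // cancelled while still queued, before a second transcribe call
+      // could ever happen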
.mockReturnValueOnce({ collect: firstCollect, cancel: vi.fn() });
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1');
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2');
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+
+    const secondRejection = expect(secondPromise).rejects.toThrow('Transcription cancelled');
+    expect(mediaTranscriptionService.cancelTranscription('media-2')).toBe(true);
+    await secondRejection;
+    expect(transcribeMock).toHaveBeenCalledTimes(1);
+
+    resolveFirstCollect([{ text: ' first ', start: 0, end: 1 }]);
+    await firstPromise;
+  });
+
+  it('cancels the active transcription job and advances the queue', async () => {
+    const sourceById = {
+      'media-1': new File(['one'], 'one.mp3', { type: 'audio/mpeg' }),
+      'media-2': new File(['two'], 'two.mp3', { type: 'audio/mpeg' }),
+    } as const;
+    getMediaMock.mockImplementation(async (mediaId: string) => ({
+      id: mediaId,
+      fileName: `${mediaId}.mp3`,
+      mimeType: 'audio/mpeg',
+      codec: 'mp3',
+      fileLastModified: 123,
+    }));
+    getMediaFileMock.mockImplementation(async (mediaId: string) => sourceById[mediaId as keyof typeof sourceById]);
+
+    let rejectFirstCollect!: (error: Error) => void;
+    const firstCollect = vi.fn(() => new Promise<Array<{ text: string; start: number; end: number }>>((_, reject) => {
+      rejectFirstCollect = reject;
+    }));
+    const firstCancel = vi.fn((message?: string) => {
+      rejectFirstCollect(new Error(message ?? 'Transcription cancelled'));
+    });
+    const secondCollect = vi.fn().mockResolvedValue([
+      { text: ' second ', start: 0, end: 1 },
+    ]);
+
+    transcribeMock
+      .mockReturnValueOnce({ collect: firstCollect, cancel: firstCancel })
+      .mockReturnValueOnce({ collect: secondCollect, cancel: vi.fn() });
+
+    const firstPromise = mediaTranscriptionService.transcribeMedia('media-1');
+    const secondPromise = mediaTranscriptionService.transcribeMedia('media-2');
+
+    await waitFor(() => {
+      expect(transcribeMock).toHaveBeenCalledTimes(1);
+    });
+
+    expect(mediaTranscriptionService.cancelTranscription('media-1')).toBe(true);
+    await expect(firstPromise).rejects.toThrow('Transcription cancelled');
+
+    const secondTranscript = await secondPromise;
+    expect(firstCancel).toHaveBeenCalledWith('Transcription cancelled');
+    expect(secondTranscript.mediaId).toBe('media-2');
+    expect(transcribeMock).toHaveBeenCalledTimes(2);
+  });
+});
diff --git a/src/features/media-library/services/media-transcription-service.ts b/src/features/media-library/services/media-transcription-service.ts
index a4268fa45..93800f41d 100644
--- a/src/features/media-library/services/media-transcription-service.ts
+++ b/src/features/media-library/services/media-transcription-service.ts
@@ -9,7 +9,7 @@ import { useSelectionStore } from '@/shared/state/selection';
 import { createLogger } from '@/shared/logging/logger';
 import type { MediaTranscript, MediaTranscriptModel } from '@/types/storage';
 import type { AudioItem, TextItem, TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline';
-import type { TranscribeOptions } from '../transcription/types';
+import type { TranscriptSegment, TranscribeOptions } from '../transcription/types';
 import {
   getDefaultMediaTranscriptionAdapter,
   getMediaTranscriptionModelLabel,
@@ -17,20 +17,27 @@ import {
 import { mediaLibraryService } from './media-library-service';
 import {
   buildCaptionTextItems,
-  buildCaptionTrack,
+  buildCaptionTrackAbove,
   findReplaceableCaptionItemsForClip,
   findCompatibleCaptionTrackForRanges,
+  isCaptionTrackCandidate,
   getCaptionTextItemTemplate,
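+  // caption-track helpers shared with media-captioning-service.ts; per the
+  // note there, the two services must stay behaviorally aligned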
getCaptionRangeForClip,
 } from '../utils/caption-items';
 import { useProjectStore } from '@/features/media-library/deps/projects';
 import { useTimelineStore } from '@/features/media-library/deps/timeline-stores';
 import { useSettingsStore } from '@/features/media-library/deps/settings-contract';
+import {
+  needsCustomAudioDecoder,
+  resolvePreviewAudioConformUrl,
+  startPreviewAudioConform,
+} from '@/features/media-library/deps/composition-runtime-contract';
 import {
   DEFAULT_WHISPER_MODEL,
   DEFAULT_WHISPER_QUANTIZATION,
   normalizeWhisperLanguage,
 } from '@/shared/utils/whisper-settings';
+import { TRANSCRIPTION_CANCELLED_MESSAGE } from '@/shared/utils/transcription-cancellation';
 
 const logger = createLogger('MediaTranscriptionService');
 const DEFAULT_MODEL: MediaTranscriptModel = DEFAULT_WHISPER_MODEL;
@@ -47,61 +54,269 @@ interface InsertTranscriptAsCaptionsResult {
   removedItemCount: number;
 }
 
+type QueueState = 'queued' | 'running';
+
+interface TranscriptionRequestOptions {
+  language?: string;
+  model?: MediaTranscriptModel;
+  quantization?: TranscribeOptions['quantization'];
+  onProgress?: TranscribeOptions['onProgress'];
+  onQueueStatusChange?: (state: QueueState) => void;
+}
+
+interface QueuedTranscriptionListener {
+  onProgress?: TranscribeOptions['onProgress'];
+  onQueueStatusChange?: (state: QueueState) => void;
+}
+
+interface QueuedTranscriptionJob {
+  mediaId: string;
+  requestKey: string;
+  model: MediaTranscriptModel;
+  quantization: NonNullable<TranscribeOptions['quantization']>;
+  language?: string;
+  listeners: QueuedTranscriptionListener[];
+  promise: Promise<MediaTranscript>;
+  resolve: (value: MediaTranscript) => void;
+  reject: (reason?: unknown) => void;
+  state: QueueState;
+  stream: { collect(): Promise<TranscriptSegment[]>; cancel(message?: string): void } | null;
+  cancelled: boolean;
+  cancelMessage: string;
+}
+
 class MediaTranscriptionService {
   private readonly adapter = getDefaultMediaTranscriptionAdapter();
   private readonly transcriber = this.adapter.createTranscriber({
     model: DEFAULT_MODEL,
     quantization: DEFAULT_QUANTIZATION,
   });
+  private activeJob: QueuedTranscriptionJob | null = null;
+  private queue: QueuedTranscriptionJob[] = [];
 
   getTranscript = getTranscript;
   getTranscriptMediaIds = getTranscriptMediaIds;
-  deleteTranscript = deleteTranscript;
+
+  async deleteTranscript(mediaId: string): Promise<void> {
+    await deleteTranscript(mediaId);
+  }
 
   async transcribeMedia(
     mediaId: string,
-    options: Pick<TranscribeOptions, 'language' | 'model' | 'quantization' | 'onProgress'> = {},
+    options: TranscriptionRequestOptions = {},
   ): Promise<MediaTranscript> {
+    const settings = useSettingsStore.getState();
+    const model = options.model ?? settings.defaultWhisperModel ?? DEFAULT_MODEL;
+    const quantization =
+      options.quantization ?? settings.defaultWhisperQuantization ?? DEFAULT_QUANTIZATION;
+    const language = normalizeWhisperLanguage(options.language ?? settings.defaultWhisperLanguage);
+    const requestKey = `${mediaId}:${model}:${quantization}:${language ??
'auto'}`;
+    const listener: QueuedTranscriptionListener = {
+      onProgress: options.onProgress,
+      onQueueStatusChange: options.onQueueStatusChange,
+    };
+    const existingJob = this.findJobByKey(requestKey);
+
+    if (existingJob) {
+      this.attachListener(existingJob, listener);
+      return existingJob.promise;
+    }
+
+    const job = this.createJob({
+      mediaId,
+      requestKey,
+      model,
+      quantization,
+      language,
+      listener,
+    });
+
+    if (this.activeJob) {
+      this.queue.push(job);
+      this.setJobState(job, 'queued');
+    } else {
+      this.startJob(job);
+    }
+
+    return job.promise;
+  }
+
+  cancelTranscription(mediaId: string, message = TRANSCRIPTION_CANCELLED_MESSAGE): boolean {
+    let cancelled = false;
+
+    this.queue = this.queue.filter((job) => {
+      if (job.mediaId !== mediaId) {
+        return true;
+      }
+
+      cancelled = true;
+      this.cancelJob(job, message);
+      return false;
+    });
+
+    if (this.activeJob?.mediaId === mediaId) {
+      cancelled = true;
+      this.cancelJob(this.activeJob, message);
+    }
+
+    return cancelled;
+  }
+
+  private findJobByKey(requestKey: string): QueuedTranscriptionJob | null {
+    if (this.activeJob?.requestKey === requestKey) {
+      return this.activeJob;
+    }
+
+    return this.queue.find((job) => job.requestKey === requestKey) ?? null;
+  }
+
+  private createJob({
+    mediaId,
+    requestKey,
+    model,
+    quantization,
+    language,
+    listener,
+  }: {
+    mediaId: string;
+    requestKey: string;
+    model: MediaTranscriptModel;
+    quantization: NonNullable<TranscribeOptions['quantization']>;
+    language?: string;
+    listener: QueuedTranscriptionListener;
+  }): QueuedTranscriptionJob {
+    let resolve!: (value: MediaTranscript) => void;
+    let reject!: (reason?: unknown) => void;
+    const promise = new Promise<MediaTranscript>((resolvePromise, rejectPromise) => {
+      resolve = resolvePromise;
+      reject = rejectPromise;
+    });
+
+    return {
+      mediaId,
+      requestKey,
+      model,
+      quantization,
+      language,
+      listeners: [listener],
+      promise,
+      resolve,
+      reject,
+      state: 'queued',
+      stream: null,
+      cancelled: false,
+      cancelMessage: TRANSCRIPTION_CANCELLED_MESSAGE,
+    };
+  }
+
+  private attachListener(job: QueuedTranscriptionJob, listener: QueuedTranscriptionListener): void {
+    job.listeners.push(listener);
+    listener.onQueueStatusChange?.(job.state);
+  }
+
+  private setJobState(job: QueuedTranscriptionJob, state: QueueState): void {
+    job.state = state;
+    for (const listener of job.listeners) {
+      listener.onQueueStatusChange?.(state);
+    }
+  }
+
+  private cancelJob(job: QueuedTranscriptionJob, message: string): void {
+    job.cancelled = true;
+    job.cancelMessage = message;
+
+    if (job.state === 'queued') {
+      job.reject(new Error(message));
+      return;
+    }
+
+    job.stream?.cancel(message);
+  }
+
+  private startJob(job: QueuedTranscriptionJob): void {
+    this.activeJob = job;
+    this.setJobState(job, 'running');
+
+    void (async () => {
+      try {
+        const transcript = await this.executeTranscriptionJob(job);
+        job.resolve(transcript);
+      } catch (error) {
+        job.reject(error);
+      } finally {
+        if (this.activeJob === job) {
+          this.activeJob = null;
+        }
+        this.processNextJob();
+      }
+    })();
+  }
+
+  private processNextJob(): void {
+    if (this.activeJob) {
+      return;
+    }
+
+    const nextJob = this.queue.shift();
+    if (nextJob) {
+      this.startJob(nextJob);
+    }
+  }
+
+  private throwIfCancelled(job: QueuedTranscriptionJob): void {
+    if (job.cancelled) {
+      throw new Error(job.cancelMessage);
+    }
+  }
+
+  private async executeTranscriptionJob(job: QueuedTranscriptionJob): Promise<MediaTranscript> {
+    const mediaId = job.mediaId;
     const media = await mediaLibraryService.getMedia(mediaId);
     if (!media) {
       throw new Error(`Media not found:
${mediaId}`);
     }
+    this.throwIfCancelled(job);
 
     if (!media.mimeType.startsWith('audio/') && !media.mimeType.startsWith('video/')) {
       throw new Error('Only audio and video files can be transcribed');
     }
 
-    const blob = await mediaLibraryService.getMediaFile(mediaId);
-    if (!blob) {
+    const sourceBlob = await mediaLibraryService.getMediaFile(mediaId);
+    if (!sourceBlob) {
       throw new Error(`Could not load media file: ${media.fileName}`);
     }
+    this.throwIfCancelled(job);
 
-    const file = blob instanceof File
-      ? blob
-      : new File([blob], media.fileName, {
-          type: media.mimeType,
+    const transcriptionBlob = await this.resolveTranscriptionBlob(media, sourceBlob);
+    this.throwIfCancelled(job);
+
+    const file = transcriptionBlob instanceof File
+      ? transcriptionBlob
+      : new File([transcriptionBlob], media.fileName, {
+          type: transcriptionBlob.type || media.mimeType,
           lastModified: media.fileLastModified ?? Date.now(),
         });
 
-    const settings = useSettingsStore.getState();
-    const model = options.model ?? settings.defaultWhisperModel ?? DEFAULT_MODEL;
-    const quantization =
-      options.quantization ?? settings.defaultWhisperQuantization ?? DEFAULT_QUANTIZATION;
-    const language = normalizeWhisperLanguage(options.language ?? settings.defaultWhisperLanguage);
     const stream = this.transcriber.transcribe(file, {
-      model,
-      language,
-      quantization,
-      onProgress: options.onProgress,
+      model: job.model,
+      language: job.language,
+      quantization: job.quantization,
+      onProgress: (progress) => {
+        for (const listener of job.listeners) {
+          listener.onProgress?.(progress);
+        }
+      },
     });
+    job.stream = stream;
 
     const segments = await stream.collect();
+    this.throwIfCancelled(job);
 
     const transcript: MediaTranscript = {
       id: mediaId,
       mediaId,
-      model,
-      language,
-      quantization,
+      model: job.model,
+      language: job.language,
+      quantization: job.quantization,
       text: segments.map((segment) => segment.text.trim()).filter(Boolean).join(' ').trim(),
       segments: segments.map((segment) => ({
         text: segment.text.trim(),
@@ -121,6 +336,33 @@ class MediaTranscriptionService {
     return transcript;
   }
 
+  private async resolveTranscriptionBlob(media: { id: string; fileName: string; mimeType: string; codec: string; audioCodec?: string }, sourceBlob: Blob): Promise<Blob> {
+    const transcriptionCodec = media.mimeType.startsWith('audio/')
+      ? media.codec
+      : (media.audioCodec ?? media.codec);
+
+    if (!needsCustomAudioDecoder(transcriptionCodec)) {
+      return sourceBlob;
+    }
+
+    let conformedUrl = await resolvePreviewAudioConformUrl(media.id);
+    if (!conformedUrl) {
+      await startPreviewAudioConform(media.id, sourceBlob);
+      conformedUrl = await resolvePreviewAudioConformUrl(media.id);
+    }
+
+    if (!conformedUrl) {
+      throw new Error(`Failed to prepare ${transcriptionCodec || 'custom'} audio for transcription`);
+    }
+
+    const response = await fetch(conformedUrl);
+    if (!response.ok) {
+      throw new Error(`Failed to load conformed audio for transcription (${response.status})`);
+    }
+
+    return await response.blob();
+  }
+
   async insertTranscriptAsCaptions(
     mediaId: string,
     options: InsertTranscriptAsCaptionsOptions = {},
   ): Promise<InsertTranscriptAsCaptionsResult> {
@@ -143,7 +385,7 @@ class MediaTranscriptionService {
     const generatedCaptionIdsToRemove = options.replaceExisting
       ?
new Set( targetClips.flatMap((clip) => - findReplaceableCaptionItemsForClip(timeline.items, clip).map((item) => item.id) + findReplaceableCaptionItemsForClip(timeline.items, clip, 'transcript').map((item) => item.id) ) ) : new Set(); @@ -157,7 +399,7 @@ class MediaTranscriptionService { } const existingGeneratedCaptions = options.replaceExisting - ? findReplaceableCaptionItemsForClip(timeline.items, clip) + ? findReplaceableCaptionItemsForClip(timeline.items, clip, 'transcript') : []; const preferredTrackId = this.resolvePreferredCaptionTrackId( newTracks, @@ -175,7 +417,10 @@ class MediaTranscriptionService { ); if (!targetTrack) { - targetTrack = buildCaptionTrack(newTracks); + const clipTrack = newTracks.find((track) => track.id === clip.trackId); + targetTrack = clipTrack + ? buildCaptionTrackAbove(newTracks, clipTrack.order) + : buildCaptionTrackAbove(newTracks, 0); newTracks.push(targetTrack); newTracks.sort((a, b) => a.order - b.order); } @@ -283,7 +528,7 @@ class MediaTranscriptionService { } const preferredTrack = tracks.find((track) => track.id === trackIds[0]); - if (!preferredTrack || preferredTrack.visible === false || preferredTrack.locked || preferredTrack.isGroup) { + if (!preferredTrack || !isCaptionTrackCandidate(preferredTrack, items)) { return null; } diff --git a/src/features/media-library/services/proxy-service.ts b/src/features/media-library/services/proxy-service.ts index e85c15784..7734ecadb 100644 --- a/src/features/media-library/services/proxy-service.ts +++ b/src/features/media-library/services/proxy-service.ts @@ -346,8 +346,8 @@ class ProxyService { // Directory may not exist } - // Mirror deletion to workspace cache (best-effort, no-op when absent). - void removeWorkspaceCacheEntry([WORKSPACE_PROXIES_DIR, resolvedProxyKey], { + // Mirror deletion to workspace cache before reporting completion. + await removeWorkspaceCacheEntry([WORKSPACE_PROXIES_DIR, resolvedProxyKey], { recursive: true, }); } @@ -360,12 +360,11 @@ class ProxyService { const staleProxyIds: string[] = []; try { const root = await navigator.storage.getDirectory(); - let proxyRoot: FileSystemDirectoryHandle; - try { - proxyRoot = await root.getDirectoryHandle(PROXY_DIR); - } catch { - return staleProxyIds; // No proxies directory yet - } + // Create the dir if missing — on a fresh origin OPFS has no `proxies/` + // yet, but the workspace fallback below still needs a handle to back-fill + // into. Without `create: true` we'd bail before hydrating from the + // workspace folder and never show cross-origin-reused proxies. 
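+    // (`create: true` returns the existing directory when present, so this
+    // stays a cheap no-op on the common path.)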
+      const proxyRoot = await root.getDirectoryHandle(PROXY_DIR, { create: true });
 
       const requestedProxyKeys = new Set();
       for (const mediaId of mediaIds) {
diff --git a/src/features/media-library/stores/media-delete-actions.test.ts b/src/features/media-library/stores/media-delete-actions.test.ts
index d2726b4f3..18194e309 100644
--- a/src/features/media-library/stores/media-delete-actions.test.ts
+++ b/src/features/media-library/stores/media-delete-actions.test.ts
@@ -30,6 +30,12 @@ vi.mock('@/infrastructure/browser/blob-url-manager', () => ({
   blobUrlManager: blobUrlManagerMocks,
 }));
 
+const sceneBrowserMocks = vi.hoisted(() => ({
+  invalidateMediaCaptionThumbnails: vi.fn(),
+}));
+
+vi.mock('../deps/scene-browser', () => sceneBrowserMocks);
+
 type DeleteState = Partial & Partial;
 type DeleteUpdater =
   | Partial
diff --git a/src/features/media-library/stores/media-delete-actions.ts b/src/features/media-library/stores/media-delete-actions.ts
index 3fdffb666..bcd68e6fb 100644
--- a/src/features/media-library/stores/media-delete-actions.ts
+++ b/src/features/media-library/stores/media-delete-actions.ts
@@ -2,6 +2,7 @@
 import { mediaLibraryService } from '../services/media-library-service';
 import { proxyService } from '../services/proxy-service';
 import { blobUrlManager } from '@/infrastructure/browser/blob-url-manager';
+import { invalidateMediaCaptionThumbnails } from '../deps/scene-browser';
 
 type Set = (
   partial:
@@ -21,6 +22,11 @@ function releaseDeletedMediaResources(ids: string[]): void {
   for (const id of ids) {
     blobUrlManager.release(id);
     proxyService.clearProxyKey(id);
+    // Drop every Scene Browser cache tied to this media — thumbnail blob
+    // URLs (which otherwise pin the JPEG in memory forever), lazy-thumb
+    // result memos, and both text + image embedding maps. Disk-side
+    // cleanup is already handled by the recursive `media/{id}/` removal.
+    invalidateMediaCaptionThumbnails(id);
   }
 }
 
diff --git a/src/features/media-library/stores/media-library-store.ts b/src/features/media-library/stores/media-library-store.ts
index 4e086d922..b192449e0 100644
--- a/src/features/media-library/stores/media-library-store.ts
+++ b/src/features/media-library/stores/media-library-store.ts
@@ -1,4 +1,4 @@
-import { create } from 'zustand';
+import { create, type StoreApi, type UseBoundStore } from 'zustand';
 import { devtools } from 'zustand/middleware';
 import type {
   MediaLibraryState,
@@ -74,7 +74,16 @@ async function initializeProxyState(mediaItems: MediaMetadata[]): Promise<void> {
   await proxyService.loadExistingProxies(videoItems.map((item) => item.id));
 }
 
-export const useMediaLibraryStore = create<
+type MediaLibraryStoreApi = UseBoundStore<StoreApi<MediaLibraryState & MediaLibraryActions>>;
+
+declare global {
+  // eslint-disable-next-line no-var
+  var __FREECUT_MEDIA_LIBRARY_STORE__: MediaLibraryStoreApi | undefined;
+}
+
+const hotStore = import.meta.env.DEV ? globalThis.__FREECUT_MEDIA_LIBRARY_STORE__ : undefined;
+
+const newStore: MediaLibraryStoreApi = hotStore ??
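+  // first execution builds a fresh store; an HMR re-execution reuses the
+  // live instance stashed on globalThis below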
create<
   MediaLibraryState & MediaLibraryActions
 >()(
   devtools(
@@ -120,6 +129,7 @@
 
       // AI tagging
       taggingMediaIds: new Set(),
+      analysisProgress: null,
 
       // v3: Set current project context
       setCurrentProject: (projectId: string | null) => {
@@ -141,6 +151,7 @@
         transcriptStatus: new Map(),
         transcriptProgress: new Map(),
         taggingMediaIds: new Set(),
+        analysisProgress: null,
       });
       // Note: loadMediaItems is triggered by the component's useEffect
       // Don't call it here to avoid double loading
@@ -361,6 +372,54 @@
         return { mediaItems };
       });
     },
+
+    beginAnalysisRun: (count) => {
+      if (count <= 0) return;
+      set((state) => {
+        const current = state.analysisProgress;
+        if (!current) {
+          return { analysisProgress: { total: count, completed: 0, cancelRequested: false } };
+        }
+        // Merge concurrent runs (e.g. a per-card analyze while a batch is
+        // in flight) by growing the total so the bar keeps tracking one
+        // combined run toward 100% instead of snapping back to zero.
+        return {
+          analysisProgress: {
+            total: current.total + count,
+            completed: current.completed,
+            cancelRequested: current.cancelRequested,
+          },
+        };
+      });
+    },
+
+    incrementAnalysisCompleted: (n = 1) => {
+      set((state) => {
+        if (!state.analysisProgress) return state;
+        return {
+          analysisProgress: {
+            ...state.analysisProgress,
+            completed: Math.min(
+              state.analysisProgress.total,
+              state.analysisProgress.completed + n,
+            ),
+          },
+        };
+      });
+    },
+
+    requestAnalysisCancel: () => {
+      set((state) => {
+        if (!state.analysisProgress) return state;
+        return {
+          analysisProgress: { ...state.analysisProgress, cancelRequested: true },
+        };
+      });
+    },
+
+    endAnalysisRun: () => {
+      set({ analysisProgress: null });
+    },
   }),
   {
     name: 'MediaLibraryStore',
@@ -369,28 +428,46 @@
   )
 );
 
-// Keep mediaById synchronized even when action modules update mediaItems directly.
-let prevMediaItemsRef = useMediaLibraryStore.getState().mediaItems;
-useMediaLibraryStore.subscribe((state) => {
-  if (state.mediaItems === prevMediaItemsRef) {
-    return;
-  }
-  prevMediaItemsRef = state.mediaItems;
-  useMediaLibraryStore.setState({ mediaById: buildMediaById(state.mediaItems) });
-});
-
-// Wire up proxy service status listener to update store state
-proxyService.onStatusChange((mediaId, status, progress) => {
-  const store = useMediaLibraryStore.getState();
-  if (status === 'idle') {
-    store.clearProxyStatus(mediaId);
-    return;
-  }
-  store.setProxyStatus(mediaId, status);
-  if (progress !== undefined) {
-    store.setProxyProgress(mediaId, progress);
-  }
-});
+// Preserve the store instance across Vite HMR so that mediaItems and the
+// rest of project state don't reset to `[]` on every file save — without
+// this, editing a feature component wipes the scene browser's "X clips · Y
+// scenes" and requires a hard refresh to reload via `loadMediaItems`.
+// DEV-only: prod builds don't HMR so the cache is harmless to skip.
+if (import.meta.env.DEV) {
+  globalThis.__FREECUT_MEDIA_LIBRARY_STORE__ = newStore;
+}
+
+export const useMediaLibraryStore = newStore;
+
+// Subscriptions (below) must only be wired the first time the store is
+// created. On HMR, `hotStore` is non-null and the subscription from the
+// previous module execution is still live on the store — re-wiring here
+// would leak a listener on every file save, eventually double-updating
+// `mediaById` and double-firing proxy status changes.
+if (!hotStore) {
+  // Keep mediaById synchronized even when action modules update mediaItems directly.
+  let prevMediaItemsRef = useMediaLibraryStore.getState().mediaItems;
+  useMediaLibraryStore.subscribe((state) => {
+    if (state.mediaItems === prevMediaItemsRef) {
+      return;
+    }
+    prevMediaItemsRef = state.mediaItems;
+    useMediaLibraryStore.setState({ mediaById: buildMediaById(state.mediaItems) });
+  });
+
+  // Wire up proxy service status listener to update store state
+  proxyService.onStatusChange((mediaId, status, progress) => {
+    const store = useMediaLibraryStore.getState();
+    if (status === 'idle') {
+      store.clearProxyStatus(mediaId);
+      return;
+    }
+    store.setProxyStatus(mediaId, status);
+    if (progress !== undefined) {
+      store.setProxyProgress(mediaId, progress);
+    }
+  });
+}
 
 // Selector hooks for common use cases (optional, but recommended)
 export const useFilteredMediaItems = () => {
diff --git a/src/features/media-library/transcription/browser-transcriber.ts b/src/features/media-library/transcription/browser-transcriber.ts
index 2367eabe4..b81920bf0 100644
--- a/src/features/media-library/transcription/browser-transcriber.ts
+++ b/src/features/media-library/transcription/browser-transcriber.ts
@@ -10,6 +10,8 @@ import type { MediaTranscriptQuantization } from '@/types/storage';
 import { localInferenceRuntimeRegistry } from '@/shared/state/local-inference';
 import { LOCAL_INFERENCE_UNLOADED_MESSAGE } from '@/shared/state/local-inference';
 import { formatWhisperRuntimeModelLabel, estimateWhisperRuntimeBytes } from './runtime-estimates';
+import { DEFAULT_WHISPER_MODEL } from '@/shared/utils/whisper-settings';
+import { usePlaybackStore } from '@/shared/state/playback';
 
 export class BrowserTranscriber {
   private readonly defaultOptions: TranscribeOptions;
@@ -37,6 +39,9 @@ export class TranscribeStream implements AsyncIterable<TranscriptSegment> {
   private bridge: Bridge | null = null;
   private started = false;
   private runtimeRegistered = false;
+  private unsubscribePlayback: (() => void) | null = null;
+  private idleResumeTimer: ReturnType<typeof setTimeout> | null = null;
+  private workerPaused = false;
 
   constructor(file: File, options: TranscribeOptions = {}) {
     this.file = file;
@@ -77,9 +82,70 @@ export class TranscribeStream implements AsyncIterable<TranscriptSegment> {
     this.queue.length = 0;
     this.error = new Error(message);
     this.unregisterRuntime();
+    this.stopPlaybackWatcher();
     this.wakeUp();
   }
 
+  private startPlaybackWatcher(): void {
+    if (this.unsubscribePlayback) return;
+    if (!this.bridge) return;
+
+    const IDLE_RESUME_MS = 400;
+
+    const pauseWorker = () => {
+      if (this.idleResumeTimer !== null) {
+        clearTimeout(this.idleResumeTimer);
+        this.idleResumeTimer = null;
+      }
+      if (this.workerPaused) return;
+      this.workerPaused = true;
+      this.bridge?.setPaused(true);
+    };
+
+    const scheduleResume = () => {
+      if (this.idleResumeTimer !== null) {
+        clearTimeout(this.idleResumeTimer);
+      }
+      this.idleResumeTimer = setTimeout(() => {
+        this.idleResumeTimer = null;
+        const playback = usePlaybackStore.getState();
+        if (playback.isPlaying || playback.previewFrame !== null) return;
+        this.workerPaused = false;
+        this.bridge?.setPaused(false);
+      }, IDLE_RESUME_MS);
+    };
+
+    const initial = usePlaybackStore.getState();
+    if (initial.isPlaying || initial.previewFrame !== null) {
+      pauseWorker();
+    }
+
+    this.unsubscribePlayback = usePlaybackStore.subscribe((state, prev) => {
+      const isActive = state.isPlaying || state.previewFrame !== null;
+      const frameMoved = state.currentFrameEpoch !== prev.currentFrameEpoch;
+
+      if (isActive || frameMoved) {
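+        // playback or a scrub step pauses the whisper/decoder workers right
+        // away; resume is debounced in scheduleResume so rapid scrubbing
+        // doesn't thrash the pipeline with pause/resume churn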
pauseWorker();
+        if (!state.isPlaying) scheduleResume();
+        return;
+      }
+
+      if (prev.isPlaying && !state.isPlaying) {
+        scheduleResume();
+      }
+    });
+  }
+
+  private stopPlaybackWatcher(): void {
+    this.unsubscribePlayback?.();
+    this.unsubscribePlayback = null;
+    if (this.idleResumeTimer !== null) {
+      clearTimeout(this.idleResumeTimer);
+      this.idleResumeTimer = null;
+    }
+    this.workerPaused = false;
+  }
+
   private async startBridge(): Promise<void> {
     if (this.started) {
       return;
     }
@@ -104,11 +170,13 @@
       onDone: () => {
         this.doneFlag = true;
         this.unregisterRuntime();
+        this.stopPlaybackWatcher();
         this.wakeUp();
       },
       onError: (message: string) => {
         this.error = new Error(message);
         this.unregisterRuntime();
+        this.stopPlaybackWatcher();
        this.wakeUp();
       },
     });
@@ -116,13 +184,15 @@
     try {
       await this.bridge.start(
         this.file,
-        (this.options.model as WhisperModel | undefined) ?? 'whisper-tiny',
+        (this.options.model as WhisperModel | undefined) ?? DEFAULT_WHISPER_MODEL,
         this.options.language,
         this.options.quantization,
       );
+      this.startPlaybackWatcher();
     } catch (error) {
       this.error = error instanceof Error ? error : new Error(String(error));
       this.unregisterRuntime();
+      this.stopPlaybackWatcher();
       this.wakeUp();
     }
   }
@@ -133,7 +203,7 @@
     }
     this.runtimeRegistered = true;
 
-    const model = (this.options.model as WhisperModel | undefined) ?? 'whisper-tiny';
+    const model = (this.options.model as WhisperModel | undefined) ?? DEFAULT_WHISPER_MODEL;
     const quantization = (this.options.quantization as MediaTranscriptQuantization | undefined) ?? 'hybrid';
 
     const now = Date.now();
diff --git a/src/features/media-library/transcription/lib/bridge.ts b/src/features/media-library/transcription/lib/bridge.ts
index 849c9a969..6d49e1c70 100644
--- a/src/features/media-library/transcription/lib/bridge.ts
+++ b/src/features/media-library/transcription/lib/bridge.ts
@@ -11,6 +11,7 @@ import { MODEL_IDS } from '../types';
 import { createManagedWorkerSession } from '@/shared/utils/managed-worker-session';
 import { Chunker } from './chunker';
 import { downmixToMono, resampleTo16kHz } from './resampler';
+import { DEFAULT_WHISPER_MODEL } from '@/shared/utils/whisper-settings';
 
 export interface BridgeCallbacks {
   onSegment: (segment: TranscriptSegment) => void;
@@ -73,7 +74,7 @@ export class Bridge {
 
   async start(
     file: File,
-    model: WhisperModel = 'whisper-tiny',
+    model: WhisperModel = DEFAULT_WHISPER_MODEL,
     language?: string,
     quantization?: QuantizationType,
   ): Promise<void> {
@@ -112,6 +113,19 @@
     this.session.terminate();
   }
 
+  setPaused(paused: boolean): void {
+    if (this.session.isTerminated()) {
+      return;
+    }
+
+    const message = { type: paused ?
'pause' : 'resume' } as const;
+    this.session.getWorker('whisper').postMessage(message);
+    const hasWebCodecs = typeof window !== 'undefined' && 'AudioDecoder' in window;
+    if (hasWebCodecs) {
+      this.session.getWorker('decoder').postMessage(message);
+    }
+  }
+
   private async decodeWithAudioContext(file: File, port: MessagePort): Promise<void> {
     try {
       this.callbacks.onProgress({ stage: 'decoding', progress: 0 });
diff --git a/src/features/media-library/transcription/registry.test.ts b/src/features/media-library/transcription/registry.test.ts
index c58f4d678..c053ba27f 100644
--- a/src/features/media-library/transcription/registry.test.ts
+++ b/src/features/media-library/transcription/registry.test.ts
@@ -12,11 +12,15 @@ describe('mediaTranscriptionAdapterRegistry', () => {
       id: 'browser-whisper',
       label: 'Browser Whisper',
     });
-    expect(getDefaultMediaTranscriptionModel()).toBe('whisper-tiny');
+    expect(getDefaultMediaTranscriptionModel()).toBe('whisper-small');
     expect(getMediaTranscriptionModelOptions()).toContainEqual({
       value: 'whisper-small',
       label: 'Small',
     });
+    expect(getMediaTranscriptionModelOptions()).not.toContainEqual({
+      value: 'whisper-tiny',
+      label: 'Tiny',
+    });
   });
 
   it('formats model labels through the active adapter', () => {
diff --git a/src/features/media-library/transcription/types.ts b/src/features/media-library/transcription/types.ts
index 8c889d652..d6e9cecc2 100644
--- a/src/features/media-library/transcription/types.ts
+++ b/src/features/media-library/transcription/types.ts
@@ -49,7 +49,9 @@ export type WhisperWorkerMessage =
       modelId: string;
       language?: string;
       quantization?: QuantizationType;
-    };
+    }
+  | { type: 'pause' }
+  | { type: 'resume' };
 
 export const MODEL_IDS: Record<WhisperModel, string> = {
   'whisper-tiny': 'onnx-community/whisper-tiny',
diff --git a/src/features/media-library/transcription/workers/decoder.worker.ts b/src/features/media-library/transcription/workers/decoder.worker.ts
index 60f53e9f9..6a10f248d 100644
--- a/src/features/media-library/transcription/workers/decoder.worker.ts
+++ b/src/features/media-library/transcription/workers/decoder.worker.ts
@@ -11,6 +11,8 @@ import type { MainThreadMessage, PCMChunk } from '../types';
 let port: MessagePort | null = null;
 let whisperQueueSize = 0;
 let whisperQueueWaiter: (() => void) | null = null;
+let paused = false;
+let pauseWaiter: (() => void) | null = null;
 
 self.onmessage = async (event: MessageEvent) => {
   const message = event.data as { type: string; port?: MessagePort; file?: File };
@@ -27,6 +29,21 @@
     return;
   }
 
+  if (message.type === 'pause') {
+    paused = true;
+    return;
+  }
+
+  if (message.type === 'resume') {
+    paused = false;
+    if (pauseWaiter) {
+      const waiter = pauseWaiter;
+      pauseWaiter = null;
+      waiter();
+    }
+    return;
+  }
+
   if (message.type === 'init' && message.file) {
     try {
       await run(message.file);
@@ -39,6 +56,13 @@
   }
 };
 
+function awaitResume(): Promise<void> {
+  if (!paused) return Promise.resolve();
+  return new Promise((resolve) => {
+    pauseWaiter = resolve;
+  });
+}
+
 async function run(file: File): Promise<void> {
   if (typeof AudioDecoder === 'undefined') {
     throw new Error('WebCodecs AudioDecoder is not available in this browser');
   }
@@ -146,6 +170,10 @@
   try {
     const sink = new EncodedPacketSink(audioTrack);
     for await (const packet of sink.packets()) {
+      if (paused) {
+        await awaitResume();
+      }
+
       while (decoder.decodeQueueSize > 10 || whisperQueueSize >= 3) {
         await new
Promise((resolve) => {
           if (decoder.decodeQueueSize > 10) {
diff --git a/src/features/media-library/transcription/workers/whisper.worker.ts b/src/features/media-library/transcription/workers/whisper.worker.ts
index d0917be50..f64854339 100644
--- a/src/features/media-library/transcription/workers/whisper.worker.ts
+++ b/src/features/media-library/transcription/workers/whisper.worker.ts
@@ -48,10 +48,29 @@ let currentModelId: string | null = null;
 let port: MessagePort | null = null;
 let language: string | undefined;
 let pipelineReady = false;
+let paused = false;
 const queue: PCMChunk[] = [];
 let processing = false;
 let reportedEstimatedBytes = 0;
 
+self.addEventListener('unhandledrejection', (event: PromiseRejectionEvent) => {
+  const reason = event.reason;
+  const message = reason instanceof Error
+    ? `${reason.name}: ${reason.message}`
+    : typeof reason === 'string'
+      ? reason
+      : 'Unknown worker error';
+  postMain({ type: 'error', message });
+  event.preventDefault();
+});
+
+self.addEventListener('error', (event: ErrorEvent) => {
+  postMain({
+    type: 'error',
+    message: event.message || (event.error instanceof Error ? event.error.message : 'Worker error'),
+  });
+});
+
 self.onmessage = async (event: MessageEvent) => {
   const message = event.data as WhisperWorkerMessage;
 
@@ -66,18 +85,35 @@
   if (message.type === 'init') {
     language = message.language;
     await initPipeline(message.modelId, message.quantization ?? 'hybrid');
+    return;
+  }
+
+  if (message.type === 'pause') {
+    paused = true;
+    return;
+  }
+
+  if (message.type === 'resume') {
+    if (!paused) return;
+    paused = false;
+    if (pipelineReady && !processing && queue.length > 0) {
+      void processNext();
+    }
   }
 };
 
 function enqueue(chunk: PCMChunk): void {
   queue.push(chunk);
   port?.postMessage(queue.length);
-  if (pipelineReady && !processing) {
+  if (pipelineReady && !processing && !paused) {
     void processNext();
   }
 }
 
-async function initPipeline(modelId: string, quantization: QuantizationType): Promise<void> {
+async function initPipeline(
+  modelId: string,
+  quantization: QuantizationType,
+): Promise<void> {
   postMain({ type: 'progress', event: { stage: 'loading', progress: 0 } });
   reportedEstimatedBytes = 0;
@@ -189,7 +225,7 @@
 }
 
 async function processNext(): Promise<void> {
-  if (!pipelineReady || !asrPipeline) {
+  if (!pipelineReady || !asrPipeline || paused) {
     processing = false;
     return;
   }
@@ -215,7 +251,7 @@
   }
 
   processing = false;
-  if (queue.length > 0) {
+  if (queue.length > 0 && !paused) {
     void processNext();
   }
 }
diff --git a/src/features/media-library/types.ts b/src/features/media-library/types.ts
index b5bc177d5..a3e6ca76b 100644
--- a/src/features/media-library/types.ts
+++ b/src/features/media-library/types.ts
@@ -11,7 +11,7 @@ export interface MediaLibrarySelection {
   compositionIds: string[];
 }
 
-export type MediaTranscriptStatus = 'idle' | 'transcribing' | 'ready' | 'error';
+export type MediaTranscriptStatus = 'idle' | 'queued' | 'transcribing' | 'ready' | 'error';
 
 export type MediaTranscriptProgress = TranscriptionProgressSnapshot;
 
@@ -94,6 +94,19 @@ export interface MediaLibraryState {
 
   // AI tagging
   taggingMediaIds: Set<string>;
 
+  /**
+   * Deterministic progress for the currently running AI analysis run (single
+   * item or batch). Null when no analysis is in flight.
`completed` counts + * finished items (success or failure); the background progress bar reads + * `completed / total` to draw a real percentage instead of an indeterminate + * pulse. `cancelRequested` is a soft stop — the service finishes the + * current item then skips the rest. + */ + analysisProgress: { + total: number; + completed: number; + cancelRequested: boolean; + } | null; } export interface MediaLibraryActions { @@ -185,5 +198,14 @@ export interface MediaLibraryActions { // AI captioning setTaggingMedia: (mediaId: string, active: boolean) => void; - updateMediaCaptions: (mediaId: string, captions: Array<{ timeSec: number; text: string }>) => void; + updateMediaCaptions: (mediaId: string, captions: NonNullable) => void; + + /** Start (or merge into) an analysis run — adds `count` to `total`. */ + beginAnalysisRun: (count: number) => void; + /** Increment the completed counter by one (or by `n`). */ + incrementAnalysisCompleted: (n?: number) => void; + /** Ask the current run to stop after the in-flight item. */ + requestAnalysisCancel: () => void; + /** Clear analysisProgress when the run is done. */ + endAnalysisRun: () => void; } diff --git a/src/features/media-library/utils/caption-items.test.ts b/src/features/media-library/utils/caption-items.test.ts index a10eb4249..e80e044a5 100644 --- a/src/features/media-library/utils/caption-items.test.ts +++ b/src/features/media-library/utils/caption-items.test.ts @@ -8,18 +8,80 @@ vi.mock('../deps/timeline-contract', () => ({ timelineFps: number, sourceFps: number, ) => Math.max(0, Math.round((timelineFrames / timelineFps) * sourceFps * speed)), + getNextClassicTrackName: (tracks: Array<{ name: string; kind?: string }>, kind: 'video' | 'audio') => { + const prefix = kind === 'video' ? 'V' : 'A'; + const regex = new RegExp(`^${prefix}(\\d+)$`, 'i'); + const used = new Set( + tracks + .filter((track) => track.kind === undefined || track.kind === kind) + .map((track) => { + const match = track.name.match(regex); + return match?.[1] ? Number.parseInt(match[1], 10) : NaN; + }) + .filter((value) => Number.isFinite(value) && value > 0), + ); + let next = 1; + while (used.has(next)) next += 1; + return `${prefix}${next}`; + }, + getTrackKind: (track: { name: string; kind?: string }) => { + if (track.kind === 'video' || track.kind === 'audio') { + return track.kind; + } + if (/^V(\d+)$/i.test(track.name)) { + return 'video'; + } + if (/^A(\d+)$/i.test(track.name)) { + return 'audio'; + } + return null; + }, + getEffectiveTrackKindForItem: ( + track: { id: string; name: string; kind?: string }, + items: Array<{ trackId: string; type: string }>, + ) => { + if (track.kind === 'video' || track.kind === 'audio') { + return track.kind; + } + if (/^V(\d+)$/i.test(track.name)) { + return 'video'; + } + if (/^A(\d+)$/i.test(track.name)) { + return 'audio'; + } + + let hasAudioItems = false; + for (const item of items) { + if (item.trackId !== track.id) continue; + if (item.type === 'audio') { + hasAudioItems = true; + continue; + } + return 'video'; + } + + return hasAudioItems ? 
'audio' : null; + }, })); import { + aiCaptionsToSegments, buildCaptionTextItems, + buildCaptionTrack, + buildCaptionTrackAbove, findGeneratedCaptionItemsForClip, findReplaceableCaptionItemsForClip, getCaptionTextItemTemplate, findCompatibleCaptionTrack, + findCompatibleCaptionTrackForRanges, + findCompatibleGeneratedTrackForRanges, getCaptionRangeForClip, getCaptionFrameRange, + isGeneratedContentTrackCandidate, + isCaptionTrackCandidate, normalizeCaptionSegments, } from './caption-items'; +import { getTrackKind } from '../deps/timeline-contract'; import type { TimelineItem, TimelineTrack, VideoItem } from '@/types/timeline'; describe('caption-items', () => { @@ -158,6 +220,105 @@ describe('caption-items', () => { expect(track?.id).toBe('track-2'); }); + it('never reuses audio tracks for caption text', () => { + const tracks: TimelineTrack[] = [ + { + id: 'track-audio', + name: 'A1', + kind: 'audio', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 0, + items: [], + }, + { + id: 'track-video', + name: 'V1', + kind: 'video', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 1, + items: [], + }, + ]; + + expect(isCaptionTrackCandidate(tracks[0]!, [])).toBe(false); + expect(isCaptionTrackCandidate(tracks[1]!, [])).toBe(true); + expect(findCompatibleCaptionTrack(tracks, [], 30, 90)?.id).toBe('track-video'); + expect( + findCompatibleCaptionTrackForRanges(tracks, [], [{ startFrame: 30, endFrame: 90 }])?.id, + ).toBe('track-video'); + }); + + it('can target audio tracks for generated audio content', () => { + const tracks: TimelineTrack[] = [ + { + id: 'track-generic-audio', + name: 'Track 1', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 0, + items: [], + }, + { + id: 'track-video', + name: 'V1', + kind: 'video', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 1, + items: [], + }, + { + id: 'track-audio', + name: 'A1', + kind: 'audio', + height: 64, + locked: false, + visible: true, + muted: false, + solo: false, + order: 2, + items: [], + }, + ]; + const items: TimelineItem[] = [ + { + id: 'existing-audio', + type: 'audio', + trackId: 'track-generic-audio', + from: 0, + durationInFrames: 30, + label: 'Existing audio', + src: 'blob:test', + }, + ]; + + expect(isGeneratedContentTrackCandidate(tracks[0]!, items, 'audio')).toBe(true); + expect(isGeneratedContentTrackCandidate(tracks[1]!, items, 'audio')).toBe(false); + expect(isGeneratedContentTrackCandidate(tracks[2]!, items, 'audio')).toBe(true); + expect( + findCompatibleGeneratedTrackForRanges( + tracks, + items, + [{ startFrame: 30, endFrame: 90 }], + 'audio', + )?.id, + ).toBe('track-generic-audio'); + }); + it('returns the overall transcript frame range', () => { const frameRange = getCaptionFrameRange( [ @@ -294,3 +455,95 @@ describe('caption-items', () => { expect(replaceableCaptions.map((item) => item.id)).toEqual(['legacy-caption']); }); }); + +function makeTrack(id: string, order: number): TimelineTrack { + return { + id, + name: id, + height: 40, + locked: false, + visible: true, + muted: false, + solo: false, + order, + items: [], + }; +} + +describe('aiCaptionsToSegments', () => { + it('returns [] for empty input', () => { + expect(aiCaptionsToSegments([])).toEqual([]); + }); + + it('derives end from next caption start for all but the last', () => { + const segments = aiCaptionsToSegments([ + { timeSec: 0, text: 'a' }, + { timeSec: 3, text: 'b' }, + { timeSec: 7, text: 'c' }, 
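+      // point-in-time captions: each segment's end is derived from the next caption's start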
+ ]); + expect(segments).toEqual([ + { text: 'a', start: 0, end: 3 }, + { text: 'b', start: 3, end: 7 }, + { text: 'c', start: 7, end: 10 }, // 3s fallback for the tail + ]); + }); + + it('uses provided sampleIntervalSec for the trailing caption', () => { + const segments = aiCaptionsToSegments([{ timeSec: 0, text: 'only' }], 5); + expect(segments).toEqual([{ text: 'only', start: 0, end: 5 }]); + }); + + it('sorts captions by timeSec before converting', () => { + const segments = aiCaptionsToSegments([ + { timeSec: 5, text: 'b' }, + { timeSec: 0, text: 'a' }, + ]); + expect(segments.map((s) => s.text)).toEqual(['a', 'b']); + }); +}); + +describe('buildCaptionTrackAbove', () => { + it('places the caption track halfway between the reference and the next track up', () => { + const tracks = [makeTrack('a', 0), makeTrack('b', 1), makeTrack('c', 2)]; + const captionTrack = buildCaptionTrackAbove(tracks, 2); + expect(captionTrack.order).toBe(1.5); + }); + + it('places the track a full integer above when nothing sits higher', () => { + const tracks = [makeTrack('a', 5)]; + const captionTrack = buildCaptionTrackAbove(tracks, 5); + expect(captionTrack.order).toBe(4); + }); + + it('sorts visually higher than the reference clip track after insertion', () => { + const tracks = [makeTrack('a', 0), makeTrack('clip', 1), makeTrack('b', 2)]; + const captionTrack = buildCaptionTrackAbove(tracks, 1); + const sorted = [...tracks, captionTrack].sort((x, y) => x.order - y.order); + const clipIndex = sorted.findIndex((t) => t.id === 'clip'); + const captionIndex = sorted.findIndex((t) => t.id === captionTrack.id); + // CLAUDE.md convention: lower order = visually higher (top of timeline). + expect(captionIndex).toBeLessThan(clipIndex); + }); + + it('creates a video-kind overlay track so the timeline renders it immediately', () => { + const tracks = [makeTrack('clip', 1)]; + const captionTrack = buildCaptionTrackAbove(tracks, 1); + expect(captionTrack.kind).toBe('video'); + expect(getTrackKind(captionTrack)).toBe('video'); + expect(captionTrack.name).toBe('V1'); + }); +}); + +describe('buildCaptionTrack (append-to-bottom helper)', () => { + it('still creates tracks at maxOrder + 1', () => { + const tracks = [ + { ...makeTrack('a', 0), name: 'V1', kind: 'video' as const }, + { ...makeTrack('b', 1), name: 'A1', kind: 'audio' as const }, + { ...makeTrack('c', 2), name: 'V2', kind: 'video' as const }, + ]; + const captionTrack = buildCaptionTrack(tracks); + expect(captionTrack.order).toBe(3); + expect(captionTrack.kind).toBe('video'); + expect(captionTrack.name).toBe('V3'); + }); +}); diff --git a/src/features/media-library/utils/caption-items.ts b/src/features/media-library/utils/caption-items.ts index bea59355a..eafba1fec 100644 --- a/src/features/media-library/utils/caption-items.ts +++ b/src/features/media-library/utils/caption-items.ts @@ -1,5 +1,11 @@ -import { DEFAULT_TRACK_HEIGHT } from '../deps/timeline-contract'; +import { + DEFAULT_TRACK_HEIGHT, + getEffectiveTrackKindForItem, + getNextClassicTrackName, + type TrackKind, +} from '../deps/timeline-contract'; import type { MediaTranscriptSegment } from '@/types/storage'; +import type { MediaCaption } from '@/infrastructure/analysis'; import type { AudioItem, GeneratedCaptionSource, @@ -10,6 +16,13 @@ import type { } from '@/types/timeline'; import { timelineToSourceFrames } from '../deps/timeline-contract'; +/** + * Fallback segment duration when AI captions can't infer an `end` time from + * the next caption (i.e. 
for the last caption, or when the sample interval is + * unknown). Seconds. + */ +const AI_CAPTION_FALLBACK_DURATION_SEC = 3; + interface BuildCaptionTextItemsOptions { mediaId: string; trackId: string; @@ -19,6 +32,13 @@ interface BuildCaptionTextItemsOptions { canvasWidth: number; canvasHeight: number; styleTemplate?: CaptionTextItemTemplate; + /** + * Discriminator for the `captionSource.type` stamped on the generated + * text items. Defaults to `'transcript'` (whisper flow); AI captioning + * flows pass `'ai-captions'` so later replace/remove operations can tell + * the two kinds apart on the same clip. + */ + sourceType?: GeneratedCaptionSource['type']; } export type CaptionTextItemTemplate = Pick< @@ -198,39 +218,32 @@ export function findCompatibleCaptionTrack( startFrame: number, endFrame: number, ): TimelineTrack | null { - const sortedTracks = [...tracks].sort((a, b) => a.order - b.order); - - for (const track of sortedTracks) { - if (track.visible === false || track.locked || track.isGroup) { - continue; - } - - const hasOverlap = items.some((item) => { - if (item.trackId !== track.id) { - return false; - } - - const itemEnd = item.from + item.durationInFrames; - return item.from < endFrame && itemEnd > startFrame; - }); - - if (!hasOverlap) { - return track; - } - } - - return null; + return findCompatibleGeneratedTrackForRanges( + tracks, + items, + [{ startFrame, endFrame }], + 'video', + ); } export function findCompatibleCaptionTrackForRanges( tracks: readonly TimelineTrack[], items: readonly TimelineItem[], ranges: ReadonlyArray<{ startFrame: number; endFrame: number }>, +): TimelineTrack | null { + return findCompatibleGeneratedTrackForRanges(tracks, items, ranges, 'video'); +} + +export function findCompatibleGeneratedTrackForRanges( + tracks: readonly TimelineTrack[], + items: readonly TimelineItem[], + ranges: ReadonlyArray<{ startFrame: number; endFrame: number }>, + requiredKind: TrackKind, ): TimelineTrack | null { const sortedTracks = [...tracks].sort((a, b) => a.order - b.order); for (const track of sortedTracks) { - if (track.visible === false || track.locked || track.isGroup) { + if (!isGeneratedContentTrackCandidate(track, items, requiredKind)) { continue; } @@ -253,34 +266,135 @@ export function findCompatibleCaptionTrackForRanges( return null; } +export function isGeneratedContentTrackCandidate( + track: TimelineTrack, + items: readonly TimelineItem[], + requiredKind: TrackKind, +): boolean { + if (track.visible === false || track.locked || track.isGroup) { + return false; + } + + const effectiveKind = getEffectiveTrackKindForItem(track, items); + if (requiredKind === 'audio') { + return effectiveKind === 'audio'; + } + + return effectiveKind === 'video' || effectiveKind === null; +} + +export function isCaptionTrackCandidate( + track: TimelineTrack, + items: readonly TimelineItem[], +): boolean { + return isGeneratedContentTrackCandidate(track, items, 'video'); +} + export function buildCaptionTrack(tracks: readonly TimelineTrack[]): TimelineTrack { const maxOrder = tracks.reduce((highest, track) => Math.max(highest, track.order), -1); return { id: `track-captions-${Date.now()}`, - name: 'Captions', + name: getNextClassicTrackName([...tracks], 'video'), + kind: 'video', height: DEFAULT_TRACK_HEIGHT, locked: false, + syncLock: true, visible: true, muted: false, solo: false, + volume: 0, order: maxOrder + 1, items: [], }; } -function buildCaptionSource(mediaId: string, clipId: string): GeneratedCaptionSource { +/** + * Build a captions track positioned *above* 
a reference track (the clip's + * own track in the AI-captions flow). The new track's `order` is set halfway + * between `referenceOrder` and the next track up, so both stay unique and no + * existing tracks need to shift. + * + * If nothing sits above the reference, we land a full integer lower than it. + * Matches the fractional-order pattern used by `insertTrack` in + * `use-timeline-tracks.ts`. + */ +export function buildCaptionTrackAbove( + tracks: readonly TimelineTrack[], + referenceOrder: number, +): TimelineTrack { + const ordersStrictlyAbove = tracks + .map((t) => t.order) + .filter((order) => order < referenceOrder); + const previousOrder = ordersStrictlyAbove.length > 0 + ? Math.max(...ordersStrictlyAbove) + : referenceOrder - 2; + const newOrder = (previousOrder + referenceOrder) / 2; + + return { + id: `track-captions-${Date.now()}`, + name: getNextClassicTrackName([...tracks], 'video'), + kind: 'video', + height: DEFAULT_TRACK_HEIGHT, + locked: false, + syncLock: true, + visible: true, + muted: false, + solo: false, + volume: 0, + order: newOrder, + items: [], + }; +} + +function buildCaptionSource( + mediaId: string, + clipId: string, + type: GeneratedCaptionSource['type'] = 'transcript', +): GeneratedCaptionSource { return { - type: 'transcript', + type, mediaId, clipId, }; } +/** + * Convert AI captions (point-in-time frame descriptions) into segments with + * start/end pairs consumable by {@link buildCaptionTextItems}. + * + * AI captions have no intrinsic duration — the end time is derived from the + * next caption's `timeSec`, with a fallback to the provider's sample interval + * (or {@link AI_CAPTION_FALLBACK_DURATION_SEC}) for the final caption. + */ +export function aiCaptionsToSegments( + captions: readonly MediaCaption[], + sampleIntervalSec?: number, +): MediaTranscriptSegment[] { + if (captions.length === 0) return []; + const sorted = [...captions].sort((a, b) => a.timeSec - b.timeSec); + const fallbackEndDelta = sampleIntervalSec && sampleIntervalSec > 0 + ? sampleIntervalSec + : AI_CAPTION_FALLBACK_DURATION_SEC; + + return sorted.map((caption, index) => { + const next = sorted[index + 1]; + const start = Math.max(0, caption.timeSec); + const end = next !== undefined + ? 
Math.max(start + 0.01, next.timeSec) + : start + fallbackEndDelta; + return { + text: caption.text, + start, + end, + }; + }); +} + export function isGeneratedCaptionTextItem( item: TimelineItem, ): item is TextItem & { captionSource: GeneratedCaptionSource } { return item.type === 'text' - && item.captionSource?.type === 'transcript' + && (item.captionSource?.type === 'transcript' || item.captionSource?.type === 'ai-captions') && item.captionSource.clipId.length > 0 && item.captionSource.mediaId.length > 0; } @@ -288,9 +402,12 @@ export function isGeneratedCaptionTextItem( export function findGeneratedCaptionItemsForClip( items: readonly TimelineItem[], clipId: string, + sourceType?: GeneratedCaptionSource['type'], ): Array { return items.filter((item): item is TextItem & { captionSource: GeneratedCaptionSource } => - isGeneratedCaptionTextItem(item) && item.captionSource.clipId === clipId + isGeneratedCaptionTextItem(item) + && item.captionSource.clipId === clipId + && (sourceType === undefined || item.captionSource.type === sourceType) ); } @@ -313,12 +430,18 @@ function isLegacyGeneratedCaptionItemForClip( export function findReplaceableCaptionItemsForClip( items: readonly TimelineItem[], clip: AudioItem | VideoItem, + sourceType?: GeneratedCaptionSource['type'], ): TextItem[] { - const generatedCaptionItems = findGeneratedCaptionItemsForClip(items, clip.id); + const generatedCaptionItems = findGeneratedCaptionItemsForClip(items, clip.id, sourceType); if (generatedCaptionItems.length > 0) { return generatedCaptionItems; } + // Legacy fallback only applies to transcript-generated captions (the only + // kind that predates the `captionSource` discriminator). + if (sourceType !== undefined && sourceType !== 'transcript') { + return []; + } return items.filter((item): item is TextItem => isLegacyGeneratedCaptionItemForClip(item, clip)); } @@ -352,6 +475,7 @@ export function buildCaptionTextItems({ canvasWidth, canvasHeight, styleTemplate, + sourceType = 'transcript', }: BuildCaptionTextItemsOptions): TextItem[] { const normalizedSegments = normalizeCaptionSegments(segments); const { sourceStart, sourceEnd, sourceFps, speed } = getClipSourceBounds(clip, timelineFps); @@ -387,11 +511,12 @@ export function buildCaptionTextItems({ const defaultCaptionItem: TextItem = { id: crypto.randomUUID(), type: 'text', + textRole: 'caption', trackId, from, durationInFrames, mediaId, - captionSource: buildCaptionSource(mediaId, clip.id), + captionSource: buildCaptionSource(mediaId, clip.id, sourceType), label: segment.text.slice(0, 48), text: segment.text, fontSize: Math.max(36, Math.round(canvasHeight * 0.045)), diff --git a/src/features/preview/components/edit-2up-panels.tsx b/src/features/preview/components/edit-2up-panels.tsx index 50900daad..804ce6d4c 100644 --- a/src/features/preview/components/edit-2up-panels.tsx +++ b/src/features/preview/components/edit-2up-panels.tsx @@ -4,8 +4,24 @@ import { type VideoFrameSource, } from '@/features/preview/deps/export'; import { getGlobalVideoSourcePool } from '@/features/preview/deps/player-pool'; +import { + backgroundBatchPreseek, + backgroundPreseek, + getCachedPredecodedBitmap, + waitForInflightPredecodedBitmap, +} from '@/features/preview/utils/decoder-prewarm'; +import { + getCachedEditOverlayFrame, + getEditOverlayFrameCacheKey, + hasCachedEditOverlayFrame, + putCachedEditOverlayFrame, +} from '@/features/preview/utils/edit-overlay-frame-cache'; +import { collectEditOverlayDirectionalPrewarmTimes } from 
'@/features/preview/utils/edit-overlay-prewarm-plan'; +import { + getActivePreviewScrubbingCache, + getActivePreviewVideoFrameEntry, +} from '@/features/preview/utils/preview-scrubbing-cache-bridge'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; import { resolveMediaUrl, resolveProxyUrl } from '../utils/media-resolver'; import { computeFittedMediaSize, @@ -13,6 +29,7 @@ import { renderPanelMedia, } from './edit-panel-media-utils'; import { useBlobUrlVersion } from '@/infrastructure/browser/blob-url-manager'; +import { useEditOverlayPanelPrewarm } from './use-edit-overlay-panel-prewarm'; const TYPE_PLACEHOLDER_COLORS: Record = { image: '#22c55e', @@ -29,10 +46,11 @@ const GAP = 8; const FALLBACK_CANVAS_WIDTH = 280; const FALLBACK_CANVAS_HEIGHT = 158; const STRICT_DECODE_FALLBACK_FAILURES = 2; -/** Frame cache for edit overlay panels — instant revisits during drag reversal */ -const EDIT_PANEL_CACHE_MAX = 60; -/** Quantize source time to ~frame-level resolution for cache keys */ const CACHE_TIME_QUANTUM = 1 / 60; +const STRICT_DECODE_SHARED_CACHE_WAIT_MS = 6; +const EDIT_OVERLAY_PREWARM_MAX_TIMESTAMPS = 6; +const SCRUBBING_CACHE_TOLERANCE_FACTOR = 0.9; +const EDIT_OVERLAY_LEGACY_SEEK_SPEED_EPSILON = 0.01; let previewVideoInstanceCounter = 0; let strictDecodeInstanceCounter = 0; let globalEditOverlayDecoderPool: SharedVideoExtractorPool | null = null; @@ -44,14 +62,18 @@ function getEditOverlayDecoderPool(): SharedVideoExtractorPool { return globalEditOverlayDecoderPool; } -function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean): string | null { +function quantizeOverlayCacheTime(sourceTime: number): number { + return Math.round(sourceTime / CACHE_TIME_QUANTUM) * CACHE_TIME_QUANTUM; +} + +function useResolvedVideoBlobUrl(mediaId: string | undefined): string | null { const [blobUrl, setBlobUrl] = useState<string | null>(null); const blobUrlVersion = useBlobUrlVersion(); const requestKeyRef = useRef<string | null>(null); useEffect(() => { let cancelled = false; - const requestKey = `${mediaId ??
'none'}:proxy-first`; if (requestKeyRef.current !== requestKey) { requestKeyRef.current = requestKey; setBlobUrl(null); @@ -63,14 +85,12 @@ function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean) }; } - if (useProxy) { - const proxyUrl = resolveProxyUrl(mediaId); - if (proxyUrl) { - setBlobUrl(proxyUrl); - return () => { - cancelled = true; - }; - } + const proxyUrl = resolveProxyUrl(mediaId); + if (proxyUrl) { + setBlobUrl(proxyUrl); + return () => { + cancelled = true; + }; } resolveMediaUrl(mediaId) @@ -86,7 +106,7 @@ function useResolvedVideoBlobUrl(mediaId: string | undefined, useProxy: boolean) return () => { cancelled = true; }; - }, [mediaId, useProxy, blobUrlVersion]); + }, [mediaId, blobUrlVersion]); return blobUrl; } @@ -107,6 +127,7 @@ interface EditTwoUpPanelsProps { export function EditTwoUpPanels({ leftPanel, rightPanel }: EditTwoUpPanelsProps) { const containerRef = useRef(null); const [containerSize, setContainerSize] = useState({ width: 0, height: 0 }); + useEditOverlayPanelPrewarm([leftPanel, rightPanel]); useEffect(() => { const el = containerRef.current; @@ -217,13 +238,29 @@ interface VideoFrameProps { function VideoFrameImpl({ item, sourceTime }: VideoFrameProps) { const [useLegacyFallback, setUseLegacyFallback] = useState(false); + const [legacyFailed, setLegacyFailed] = useState(false); + const prefersLegacySeek = Math.abs((item.speed ?? 1) - 1) < EDIT_OVERLAY_LEGACY_SEEK_SPEED_EPSILON; + const canUseLegacySeek = !legacyFailed; + + useEffect(() => { + setUseLegacyFallback(false); + setLegacyFailed(false); + }, [item.id, item.mediaId, prefersLegacySeek]); const handleStrictDecodeFailure = useCallback(() => { + if (!canUseLegacySeek) return; setUseLegacyFallback((prev) => (prev ? prev : true)); + }, [canUseLegacySeek]); + + const shouldUseLegacySeek = canUseLegacySeek && (prefersLegacySeek || useLegacyFallback); + + const handleLegacyFailure = useCallback(() => { + setLegacyFailed(true); + setUseLegacyFallback(false); }, []); - if (useLegacyFallback) { - return ; + if (shouldUseLegacySeek) { + return ; } return ( @@ -239,10 +276,6 @@ interface StrictDecodedVideoFrameProps extends VideoFrameProps { onDecodeFailure: () => void; } -function quantizeTime(t: number): number { - return Math.round(t / CACHE_TIME_QUANTUM) * CACHE_TIME_QUANTUM; -} - function StrictDecodedVideoFrame({ item, sourceTime, @@ -251,6 +284,7 @@ function StrictDecodedVideoFrame({ const canvasRef = useRef(null); const decoderPoolRef = useRef(getEditOverlayDecoderPool()); const decodeLaneRef = useRef(`edit-preview-strict-${++strictDecodeInstanceCounter}`); + const prewarmFps = Math.max(1, Math.round(item.sourceFps ?? 
60)); const extractorRef = useRef(null); const mountedRef = useRef(true); const decoderReadyRef = useRef(false); @@ -258,26 +292,97 @@ function StrictDecodedVideoFrame({ const pendingTimeRef = useRef(null); const consecutiveDecodeFailuresRef = useRef(0); const latestTargetTimeRef = useRef(Math.max(0, sourceTime)); - const useProxy = usePlaybackStore((s) => s.useProxy); - const blobUrl = useResolvedVideoBlobUrl(item.mediaId, useProxy); + const blobUrl = useResolvedVideoBlobUrl(item.mediaId); const contextRef = useRef(null); const decoderItemId = `${item.id}:${decodeLaneRef.current}`; - // Frame cache: quantized source time → ImageBitmap for instant revisits - const frameCacheRef = useRef>(new Map()); - const frameCacheOrderRef = useRef([]); + const prewarmInFlightRef = useRef(false); + const queuedPrewarmTimesRef = useRef([]); + const prewarmAnchorFrameRef = useRef(null); useEffect(() => { mountedRef.current = true; return () => { mountedRef.current = false; - // Clean up cached bitmaps on unmount - for (const bitmap of frameCacheRef.current.values()) { - bitmap.close(); - } - frameCacheRef.current.clear(); + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }; }, []); + const pumpDirectionalPrewarm = useCallback(() => { + if ( + prewarmInFlightRef.current + || !decoderReadyRef.current + || !mountedRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + if (!blobUrl) { + queuedPrewarmTimesRef.current = []; + return; + } + + const timestamps = queuedPrewarmTimesRef.current; + if (timestamps.length === 0) { + return; + } + + prewarmInFlightRef.current = true; + queuedPrewarmTimesRef.current = []; + + const run = async () => { + try { + await backgroundBatchPreseek(blobUrl, timestamps); + } finally { + prewarmInFlightRef.current = false; + if ( + mountedRef.current + && pendingTimeRef.current === null + && queuedPrewarmTimesRef.current.length > 0 + ) { + queueMicrotask(() => { + if (!mountedRef.current) return; + pumpDirectionalPrewarm(); + }); + } + } + }; + + void run(); + }, [blobUrl]); + + const queueDirectionalPrewarm = useCallback((targetTime: number) => { + const extractor = extractorRef.current; + if ( + !extractor + || !decoderReadyRef.current + || pendingTimeRef.current !== null + || !blobUrl + ) { + return; + } + + const result = collectEditOverlayDirectionalPrewarmTimes({ + targetTime, + duration: extractor.getDuration(), + fps: prewarmFps, + previousAnchorFrame: prewarmAnchorFrameRef.current, + quantumSeconds: CACHE_TIME_QUANTUM, + maxTimestamps: EDIT_OVERLAY_PREWARM_MAX_TIMESTAMPS, + isCached: (time) => { + const overlayCacheKey = getEditOverlayFrameCacheKey(blobUrl, time, CACHE_TIME_QUANTUM); + return hasCachedEditOverlayFrame(overlayCacheKey) + || getCachedPredecodedBitmap(blobUrl, time, CACHE_TIME_QUANTUM) !== null; + }, + }); + + prewarmAnchorFrameRef.current = result.targetFrame; + queuedPrewarmTimesRef.current = result.times; + pumpDirectionalPrewarm(); + }, [blobUrl, prewarmFps, pumpDirectionalPrewarm]); + const drawFrame = useCallback(async (targetTime: number) => { const extractor = extractorRef.current; const canvas = canvasRef.current; @@ -299,45 +404,102 @@ function StrictDecodedVideoFrame({ canvas.height = targetHeight; } - // Check frame cache first - const cacheKey = quantizeTime(targetTime); - const cache = frameCacheRef.current; - const cacheOrder = frameCacheOrderRef.current; - const cached = cache.get(cacheKey); - if (cached) { - ctx.drawImage(cached, 0, 0, canvas.width, 
canvas.height); - // Move to end of LRU order - const idx = cacheOrder.indexOf(cacheKey); - if (idx !== -1) { - cacheOrder.splice(idx, 1); - cacheOrder.push(cacheKey); + const cacheKey = blobUrl + ? getEditOverlayFrameCacheKey(blobUrl, targetTime, CACHE_TIME_QUANTUM) + : null; + const quantizedTargetTime = quantizeOverlayCacheTime(targetTime); + const drawBitmap = (bitmap: CanvasImageSource) => { + ctx.clearRect(0, 0, canvas.width, canvas.height); + ctx.drawImage(bitmap, 0, 0, canvas.width, canvas.height); + }; + const populateSharedScrubCache = (source: ImageBitmap, resolvedSourceTime: number) => { + const scrubbingCache = getActivePreviewScrubbingCache(); + if (!scrubbingCache) { + return; } + void createImageBitmap(source) + .then((bitmap) => { + scrubbingCache.putVideoFrame(item.id, bitmap, quantizeOverlayCacheTime(resolvedSourceTime)); + }) + .catch(() => { + // Shared scrub cache population is best-effort only. + }); + }; + + if (cacheKey) { + const sharedCachedFrame = getCachedEditOverlayFrame(cacheKey); + if (sharedCachedFrame) { + drawBitmap(sharedCachedFrame); + populateSharedScrubCache(sharedCachedFrame, targetTime); + return true; + } + } + + const scrubbingCacheTolerance = Math.max( + CACHE_TIME_QUANTUM / 2, + (SCRUBBING_CACHE_TOLERANCE_FACTOR / prewarmFps) / 2, + ); + const scrubCachedFrame = getActivePreviewVideoFrameEntry( + item.id, + quantizedTargetTime, + scrubbingCacheTolerance, + ); + if (scrubCachedFrame) { + drawBitmap(scrubCachedFrame.frame); return true; } - const didDraw = await extractor.drawFrame(ctx, Math.max(0, targetTime), 0, 0, canvas.width, canvas.height); - if (!didDraw) return false; - - // Cache the decoded frame as ImageBitmap - try { - const bitmap = await createImageBitmap(canvas); - cache.set(cacheKey, bitmap); - cacheOrder.push(cacheKey); - // LRU eviction - while (cacheOrder.length > EDIT_PANEL_CACHE_MAX) { - const evictKey = cacheOrder.shift()!; - const evicted = cache.get(evictKey); - if (evicted) { - evicted.close(); - cache.delete(evictKey); - } + if (blobUrl) { + const predecodedBitmap = getCachedPredecodedBitmap(blobUrl, targetTime, CACHE_TIME_QUANTUM); + if (predecodedBitmap) { + drawBitmap(predecodedBitmap); + populateSharedScrubCache(predecodedBitmap, targetTime); + return true; + } + + const inflightBitmap = await waitForInflightPredecodedBitmap( + blobUrl, + targetTime, + CACHE_TIME_QUANTUM, + STRICT_DECODE_SHARED_CACHE_WAIT_MS, + ).catch(() => null); + if (inflightBitmap) { + drawBitmap(inflightBitmap); + populateSharedScrubCache(inflightBitmap, targetTime); + return true; + } + } + + const drawResult = await extractor.drawFrameWithCapture( + ctx, + Math.max(0, targetTime), + 0, + 0, + canvas.width, + canvas.height, + ); + if (!drawResult.success) return false; + + const scrubbingCache = getActivePreviewScrubbingCache(); + if (scrubbingCache && drawResult.capturedFrame) { + scrubbingCache.putVideoFrame( + item.id, + drawResult.capturedFrame, + quantizedTargetTime, + ); + } + + if (cacheKey) { + try { + const bitmap = await createImageBitmap(canvas); + putCachedEditOverlayFrame(cacheKey, bitmap); + } catch { + // Shared overlay cache population is best-effort only. 
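+        // createImageBitmap can fail on an empty canvas; the frame is already drawn, so skip caching.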
} - } catch { - // createImageBitmap can fail on empty canvas — not critical } return true; - }, []); + }, [blobUrl]); const pumpLatestFrame = useCallback(() => { if (renderInFlightRef.current) return; @@ -352,6 +514,7 @@ function StrictDecodedVideoFrame({ const didDraw = await drawFrame(targetTime).catch(() => false); if (didDraw) { consecutiveDecodeFailuresRef.current = 0; + queueDirectionalPrewarm(targetTime); continue; } @@ -376,7 +539,7 @@ function StrictDecodedVideoFrame({ }; void run(); - }, [drawFrame, onDecodeFailure]); + }, [drawFrame, onDecodeFailure, queueDirectionalPrewarm]); useEffect(() => { decoderReadyRef.current = false; @@ -384,12 +547,9 @@ function StrictDecodedVideoFrame({ pendingTimeRef.current = null; consecutiveDecodeFailuresRef.current = 0; contextRef.current = null; - // Clear frame cache on source change - for (const bitmap of frameCacheRef.current.values()) { - bitmap.close(); - } - frameCacheRef.current.clear(); - frameCacheOrderRef.current.length = 0; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; if (!blobUrl) return; @@ -425,12 +585,18 @@ function StrictDecodedVideoFrame({ }, [blobUrl, decoderItemId, onDecodeFailure, pumpLatestFrame]); useEffect(() => { - latestTargetTimeRef.current = Math.max(0, sourceTime); - pendingTimeRef.current = latestTargetTimeRef.current; + const targetTime = Math.max(0, sourceTime); + latestTargetTimeRef.current = targetTime; + pendingTimeRef.current = targetTime; + + if (blobUrl) { + void backgroundPreseek(blobUrl, targetTime).catch(() => null); + } + if (decoderReadyRef.current) { pumpLatestFrame(); } - }, [sourceTime, pumpLatestFrame]); + }, [blobUrl, sourceTime, pumpLatestFrame]); return ( void; +} + +function LegacySeekVideoFrame({ item, sourceTime, onFailure }: LegacySeekVideoFrameProps) { const canvasRef = useRef(null); const poolRef = useRef(getGlobalVideoSourcePool()); const poolClipIdRef = useRef(`edit-preview-${++previewVideoInstanceCounter}`); const videoRef = useRef(null); - const useProxy = usePlaybackStore((s) => s.useProxy); - const blobUrl = useResolvedVideoBlobUrl(item.mediaId, useProxy); + const blobUrl = useResolvedVideoBlobUrl(item.mediaId); const contextRef = useRef(null); const seekingRef = useRef(false); const pendingTimeRef = useRef(null); @@ -510,6 +679,10 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { video.playsInline = true; videoRef.current = video; + const handleError = () => { + onFailure(); + }; + const handleSeeked = () => { seekingRef.current = false; drawFrame(); @@ -529,10 +702,12 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { } }; + video.addEventListener('error', handleError); video.addEventListener('seeked', handleSeeked); video.addEventListener('loadeddata', handleLoadedData); return () => { + video.removeEventListener('error', handleError); video.removeEventListener('seeked', handleSeeked); video.removeEventListener('loadeddata', handleLoadedData); video.pause(); @@ -541,7 +716,7 @@ function LegacySeekVideoFrame({ item, sourceTime }: VideoFrameProps) { pendingTimeRef.current = null; pool.releaseClip(clipId); }; - }, [blobUrl, drawFrame, requestSeek]); + }, [blobUrl, drawFrame, onFailure, requestSeek]); useEffect(() => { const video = videoRef.current; diff --git a/src/features/preview/components/edit-4up-panels.tsx b/src/features/preview/components/edit-4up-panels.tsx index a6dd54370..982c2e78a 100644 --- a/src/features/preview/components/edit-4up-panels.tsx +++ 
b/src/features/preview/components/edit-4up-panels.tsx @@ -5,6 +5,7 @@ import { ImageFrame, TypePlaceholder, } from './edit-2up-panels'; +import { useEditOverlayPanelPrewarm } from './use-edit-overlay-panel-prewarm'; import type { TimelineItem } from '@/types/timeline'; import { getItemAspectRatio, @@ -31,6 +32,12 @@ export function EditFourUpPanels({ }: EditFourUpPanelsProps) { const containerRef = useRef(null); const [containerSize, setContainerSize] = useState({ width: 0, height: 0 }); + useEditOverlayPanelPrewarm([ + leftPanel, + rightPanel, + topLeftCorner ?? { item: null }, + topRightCorner ?? { item: null }, + ]); useEffect(() => { const el = containerRef.current; diff --git a/src/features/preview/components/inline-source-preview.tsx b/src/features/preview/components/inline-source-preview.tsx index 8a182775b..34819550e 100644 --- a/src/features/preview/components/inline-source-preview.tsx +++ b/src/features/preview/components/inline-source-preview.tsx @@ -162,6 +162,8 @@ const InlineSourcePreviewContent = memo(function InlineSourcePreviewContent({ src={blobUrl} mediaType={mediaType} fileName={media.fileName} + pausedFrameSource="clock" + forceFastScrub={seekFrame !== null} /> diff --git a/src/features/preview/components/rolling-edit-overlay-utils.ts b/src/features/preview/components/rolling-edit-overlay-utils.ts new file mode 100644 index 000000000..460fb043c --- /dev/null +++ b/src/features/preview/components/rolling-edit-overlay-utils.ts @@ -0,0 +1,28 @@ +import type { TimelineItem } from '@/types/timeline'; +import { getSourceFrameInfo } from './edit-overlay-utils'; + +interface RollingEditPanelFramesParams { + trimmedItem: TimelineItem; + neighborItem: TimelineItem; + handle: 'start' | 'end'; + neighborDelta: number; + fps: number; +} + +export function getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle, + neighborDelta, + fps, +}: RollingEditPanelFramesParams) { + const leftItem = handle === 'end' ? trimmedItem : neighborItem; + const rightItem = handle === 'end' ? 
neighborItem : trimmedItem; + + return { + leftItem, + rightItem, + outInfo: getSourceFrameInfo(leftItem, Math.max(0, leftItem.durationInFrames + neighborDelta - 1), fps), + inInfo: getSourceFrameInfo(rightItem, neighborDelta, fps), + }; +} diff --git a/src/features/preview/components/rolling-edit-overlay.test.ts b/src/features/preview/components/rolling-edit-overlay.test.ts new file mode 100644 index 000000000..d68ab9d67 --- /dev/null +++ b/src/features/preview/components/rolling-edit-overlay.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; +import type { TimelineItem } from '@/types/timeline'; +import { getRollingEditPanelFrames } from './rolling-edit-overlay-utils'; + +function makeVideoItem(overrides: Partial<TimelineItem> = {}): TimelineItem { + return { + id: 'item', + type: 'video', + trackId: 'track-1', + from: 0, + durationInFrames: 100, + label: 'Clip', + src: 'clip.mp4', + sourceStart: 0, + sourceEnd: 200, + sourceDuration: 200, + sourceFps: 30, + ...overrides, + } as TimelineItem; +} + +describe('getRollingEditPanelFrames', () => { + it('shows earlier incoming source frames when the edit point rolls left', () => { + const trimmedItem = makeVideoItem({ + id: 'left', + from: 0, + durationInFrames: 100, + sourceStart: 0, + }); + const neighborItem = makeVideoItem({ + id: 'right', + from: 100, + durationInFrames: 100, + sourceStart: 50, + }); + + const result = getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle: 'end', + neighborDelta: -30, + fps: 30, + }); + + expect(result.leftItem.id).toBe('left'); + expect(result.rightItem.id).toBe('right'); + expect(result.outInfo.sourceFrame).toBe(69); + expect(result.inInfo.sourceFrame).toBe(20); + }); + + it('shows later incoming source frames when the edit point rolls right from a start handle drag', () => { + const neighborItem = makeVideoItem({ + id: 'left', + from: 0, + durationInFrames: 100, + sourceStart: 0, + }); + const trimmedItem = makeVideoItem({ + id: 'right', + from: 100, + durationInFrames: 100, + sourceStart: 40, + }); + + const result = getRollingEditPanelFrames({ + trimmedItem, + neighborItem, + handle: 'start', + neighborDelta: 12, + fps: 30, + }); + + expect(result.leftItem.id).toBe('left'); + expect(result.rightItem.id).toBe('right'); + expect(result.outInfo.sourceFrame).toBe(111); + expect(result.inInfo.sourceFrame).toBe(52); + }); +}); diff --git a/src/features/preview/components/rolling-edit-overlay.tsx b/src/features/preview/components/rolling-edit-overlay.tsx index b08df8f55..e3fd18958 100644 --- a/src/features/preview/components/rolling-edit-overlay.tsx +++ b/src/features/preview/components/rolling-edit-overlay.tsx @@ -4,7 +4,7 @@ import { } from '@/features/preview/deps/timeline-store'; import { useRollingEditPreviewStore } from '@/features/preview/deps/timeline-edit-preview'; import { EditTwoUpPanels } from './edit-2up-panels'; -import { getSourceFrameInfo } from './edit-overlay-utils'; +import { getRollingEditPanelFrames } from './rolling-edit-overlay-utils'; interface RollingEditOverlayProps { fps: number; @@ -27,19 +27,12 @@ export function RollingEditOverlay({ fps }: RollingEditOverlayProps) { const neighborItem = itemsMap.get(neighborItemId); if (!trimmedItem || !neighborItem) return null; - const leftItem = handle === 'end' ? trimmedItem : neighborItem; - const rightItem = handle === 'end' ?
leftItem.from + leftItem.durationInFrames + neighborDelta - : rightItem.from + neighborDelta; - - const outLocalFrame = Math.max(0, editPointFrame - leftItem.from - 1); - const inLocalFrame = Math.max(0, editPointFrame - rightItem.from); - - const outInfo = getSourceFrameInfo(leftItem, outLocalFrame, fps); - const inInfo = getSourceFrameInfo(rightItem, inLocalFrame, fps); + const { + leftItem, + rightItem, + outInfo, + inInfo, + } = getRollingEditPanelFrames({ trimmedItem, neighborItem, handle, neighborDelta, fps }); return ( ); } - diff --git a/src/features/preview/components/source-composition.tsx b/src/features/preview/components/source-composition.tsx index 624054419..f37c55030 100644 --- a/src/features/preview/components/source-composition.tsx +++ b/src/features/preview/components/source-composition.tsx @@ -16,7 +16,14 @@ import { type VideoFrameSource, } from '@/features/preview/deps/export'; import { resolveProxyUrl } from '../utils/media-resolver'; +import { + backgroundBatchPreseek, + getCachedPredecodedBitmap, + waitForInflightPredecodedBitmap, +} from '../utils/decoder-prewarm'; +import { getDirectionalPrewarmOffsets } from '../utils/fast-scrub-prewarm'; import { usePlaybackStore } from '@/shared/state/playback'; +import { useSourcePlayerStore } from '@/shared/state/source-player'; import { useMediaLibraryStore } from '@/features/preview/deps/media-library'; import { FileAudio } from 'lucide-react'; @@ -25,6 +32,8 @@ interface SourceCompositionProps { src: string; mediaType: 'video' | 'audio' | 'image'; fileName: string; + pausedFrameSource?: 'clock' | 'source-player'; + forceFastScrub?: boolean; } let sourceMonitorVideoInstanceCounter = 0; @@ -35,6 +44,12 @@ const SOURCE_MONITOR_STRICT_DECODE_FALLBACK_FAILURES = 2; const SOURCE_MONITOR_FRAME_CACHE_MAX = 90; const SOURCE_MONITOR_CACHE_TIME_QUANTUM = 1 / 60; const SOURCE_MONITOR_PLAYING_RESYNC_THRESHOLD_FRAMES = 6; +const SOURCE_MONITOR_PREWARM_MAX_TIMESTAMPS = 6; +const SOURCE_MONITOR_PREWARM_FORWARD_STEPS = 4; +const SOURCE_MONITOR_PREWARM_BACKWARD_STEPS = 6; +const SOURCE_MONITOR_PREWARM_OPPOSITE_STEPS = 2; +const SOURCE_MONITOR_PREWARM_NEUTRAL_RADIUS = 2; +const SOURCE_MONITOR_SHARED_CACHE_WAIT_MS = 4; function getSourceMonitorDecoderPool(): SharedVideoExtractorPool { if (!globalSourceMonitorDecoderPool) { @@ -66,9 +81,23 @@ function useSourceMonitorVideoSrc(mediaId: string | undefined, src: string): str }, [mediaId, proxyStatus, src, useProxy]); } -export function SourceComposition({ mediaId, src, mediaType, fileName }: SourceCompositionProps) { +export function SourceComposition({ + mediaId, + src, + mediaType, + fileName, + pausedFrameSource = 'source-player', + forceFastScrub = false, +}: SourceCompositionProps) { if (mediaType === 'video') { - return ; + return ( + + ); } if (mediaType === 'image') { return ; @@ -76,11 +105,26 @@ export function SourceComposition({ mediaId, src, mediaType, fileName }: SourceC return ; } -function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { +function VideoSource({ + mediaId, + src, + pausedFrameSource, + forceFastScrub, +}: { + mediaId?: string; + src: string; + pausedFrameSource: 'clock' | 'source-player'; + forceFastScrub: boolean; +}) { const activeSrc = useSourceMonitorVideoSrc(mediaId, src); const clock = useClock(); const playing = useClockIsPlaying(); const playbackRate = useClockPlaybackRate(); + const followSourcePlayerFrames = pausedFrameSource === 'source-player'; + const sourcePlayerPreviewScrubbing = useSourcePlayerStore((s) => ( + 
followSourcePlayerFrames && s.previewSourceFrame !== null + )); + const isPreviewScrubbing = forceFastScrub || sourcePlayerPreviewScrubbing; const videoContainerRef = useRef(null); const videoRef = useRef(null); const audioRef = useRef(null); @@ -99,18 +143,52 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const consecutiveDecodeFailuresRef = useRef(0); const frameCacheRef = useRef>(new Map()); const frameCacheOrderRef = useRef([]); + const prewarmInFlightRef = useRef(false); + const queuedPrewarmTimesRef = useRef([]); + const prewarmAnchorFrameRef = useRef(null); const { fps } = useVideoConfig(); const lastFrameRef = useRef(clock.currentFrame); const playingRef = useRef(playing); + const currentSourceFrameRef = useRef(useSourcePlayerStore.getState().currentSourceFrame); + const previewSourceFrameRef = useRef(useSourcePlayerStore.getState().previewSourceFrame); + const pausedRenderTargetKeyRef = useRef(null); const decoderItemId = `${mediaId ?? 'source-monitor'}:${decodeLaneRef.current}`; const [useLegacyPausedSeek, setUseLegacyPausedSeek] = useState(false); const [strictDecodeReady, setStrictDecodeReady] = useState(false); const [hasDecodedFrame, setHasDecodedFrame] = useState(false); + const [decodedFrameKey, setDecodedFrameKey] = useState(null); + const [pausedRenderTargetKey, setPausedRenderTargetKey] = useState(null); useEffect(() => { playingRef.current = playing; }, [playing]); + useEffect(() => { + if (!followSourcePlayerFrames) { + currentSourceFrameRef.current = clock.currentFrame; + previewSourceFrameRef.current = null; + return; + } + + return useSourcePlayerStore.subscribe((state) => { + currentSourceFrameRef.current = state.currentSourceFrame; + previewSourceFrameRef.current = state.previewSourceFrame; + }); + }, [clock.currentFrame, followSourcePlayerFrames]); + + const getResolvedPausedSourceFrame = useCallback(() => { + if (!followSourcePlayerFrames) { + return clock.currentFrame; + } + + const previewFrame = previewSourceFrameRef.current; + if (previewFrame !== null) { + return previewFrame; + } + + return currentSourceFrameRef.current; + }, [clock.currentFrame, followSourcePlayerFrames]); + useEffect(() => { mountedRef.current = true; return () => { @@ -120,14 +198,123 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } frameCacheRef.current.clear(); frameCacheOrderRef.current = []; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }; }, []); useEffect(() => { setUseLegacyPausedSeek(false); setHasDecodedFrame(false); + setDecodedFrameKey(null); + setPausedRenderTargetKey(null); + pausedRenderTargetKeyRef.current = null; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; }, [activeSrc, mediaId]); + const pumpDirectionalPrewarm = useCallback(() => { + if ( + prewarmInFlightRef.current + || !decoderReadyRef.current + || !mountedRef.current + || playingRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + if (!activeSrc) { + queuedPrewarmTimesRef.current = []; + return; + } + + const timestamps = queuedPrewarmTimesRef.current; + if (timestamps.length === 0) { + return; + } + + prewarmInFlightRef.current = true; + queuedPrewarmTimesRef.current = []; + + const run = async () => { + try { + await backgroundBatchPreseek(activeSrc, timestamps); + } finally { + prewarmInFlightRef.current = false; + if ( + mountedRef.current + && !playingRef.current + && 
pendingTimeRef.current === null + && queuedPrewarmTimesRef.current.length > 0 + ) { + queueMicrotask(() => { + if (!mountedRef.current) return; + pumpDirectionalPrewarm(); + }); + } + } + }; + + void run(); + }, [activeSrc]); + + const queueDirectionalPrewarm = useCallback((targetTime: number) => { + const extractor = extractorRef.current; + if ( + !extractor + || !decoderReadyRef.current + || playingRef.current + || pendingTimeRef.current !== null + ) { + return; + } + + const duration = extractor.getDuration(); + if (!Number.isFinite(duration) || duration <= 0) { + return; + } + + const targetFrame = Math.max(0, Math.round(targetTime * fps)); + const previousAnchorFrame = prewarmAnchorFrameRef.current; + const direction: -1 | 0 | 1 = previousAnchorFrame === null || previousAnchorFrame === targetFrame + ? 0 + : targetFrame > previousAnchorFrame + ? 1 + : -1; + prewarmAnchorFrameRef.current = targetFrame; + + const offsets = getDirectionalPrewarmOffsets(direction, { + forwardSteps: SOURCE_MONITOR_PREWARM_FORWARD_STEPS, + backwardSteps: SOURCE_MONITOR_PREWARM_BACKWARD_STEPS, + oppositeSteps: SOURCE_MONITOR_PREWARM_OPPOSITE_STEPS, + neutralRadius: SOURCE_MONITOR_PREWARM_NEUTRAL_RADIUS, + }); + + const maxFrame = Math.max(0, Math.floor(duration * fps) - 1); + const cache = frameCacheRef.current; + const nextPrewarmTimes: number[] = []; + const seen = new Set(); + + for (const offset of offsets) { + const prewarmFrame = targetFrame + offset; + if (prewarmFrame < 0 || prewarmFrame > maxFrame) continue; + const prewarmTime = quantizeSourceMonitorTime(prewarmFrame / fps); + if (prewarmTime === quantizeSourceMonitorTime(targetTime)) continue; + if (cache.has(prewarmTime) || seen.has(prewarmTime)) continue; + seen.add(prewarmTime); + nextPrewarmTimes.push(prewarmTime); + if (nextPrewarmTimes.length >= SOURCE_MONITOR_PREWARM_MAX_TIMESTAMPS) { + break; + } + } + + queuedPrewarmTimesRef.current = nextPrewarmTimes; + pumpDirectionalPrewarm(); + }, [fps, pumpDirectionalPrewarm]); + const drawDecodedFrame = useCallback(async (targetTime: number) => { const extractor = extractorRef.current; const canvas = canvasRef.current; @@ -150,6 +337,10 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } const cacheKey = quantizeSourceMonitorTime(targetTime); + const markDecodedFrame = () => { + setHasDecodedFrame(true); + setDecodedFrameKey((prev) => (prev === cacheKey ? 
prev : cacheKey)); + }; const cache = frameCacheRef.current; const cacheOrder = frameCacheOrderRef.current; const cached = cache.get(cacheKey); @@ -161,7 +352,33 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { cacheOrder.splice(cacheIndex, 1); cacheOrder.push(cacheKey); } + markDecodedFrame(); + return true; + } + + const drawSharedBitmap = (bitmap: ImageBitmap): boolean => { + ctx.clearRect(0, 0, canvas.width, canvas.height); + ctx.drawImage(bitmap, 0, 0, canvas.width, canvas.height); return true; + }; + + if (activeSrc) { + const sharedBitmap = getCachedPredecodedBitmap(activeSrc, Math.max(0, targetTime), SOURCE_MONITOR_CACHE_TIME_QUANTUM); + if (sharedBitmap && drawSharedBitmap(sharedBitmap)) { + markDecodedFrame(); + return true; + } + + const inflightBitmap = await waitForInflightPredecodedBitmap( + activeSrc, + Math.max(0, targetTime), + SOURCE_MONITOR_CACHE_TIME_QUANTUM, + SOURCE_MONITOR_SHARED_CACHE_WAIT_MS, + ).catch(() => null); + if (inflightBitmap && drawSharedBitmap(inflightBitmap)) { + markDecodedFrame(); + return true; + } } const didDraw = await extractor.drawFrame( @@ -190,8 +407,9 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { // Cache population is best-effort only. } + markDecodedFrame(); return true; - }, []); + }, [activeSrc]); const pumpLatestDecodedFrame = useCallback(() => { if (renderInFlightRef.current) return; @@ -211,7 +429,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const didDraw = await drawDecodedFrame(targetTime).catch(() => false); if (didDraw) { consecutiveDecodeFailuresRef.current = 0; - setHasDecodedFrame(true); + queueDirectionalPrewarm(targetTime); continue; } @@ -243,7 +461,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { }; void run(); - }, [drawDecodedFrame]); + }, [drawDecodedFrame, queueDirectionalPrewarm]); // Acquire/release pooled element when source changes. 
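// The source-change reset below also clears the directional prewarm queue, anchor, and in-flight flag.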
useEffect(() => { @@ -292,6 +510,9 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { pendingTimeRef.current = null; consecutiveDecodeFailuresRef.current = 0; contextRef.current = null; + prewarmInFlightRef.current = false; + queuedPrewarmTimesRef.current = []; + prewarmAnchorFrameRef.current = null; for (const bitmap of frameCacheRef.current.values()) { bitmap.close(); @@ -339,15 +560,22 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const video = videoRef.current; const audio = audioRef.current; const targetTime = frame / fps; + const targetCacheKey = quantizeSourceMonitorTime(targetTime); latestTargetTimeRef.current = targetTime; lastFrameRef.current = frame; - if (!playingRef.current && !useLegacyPausedSeek) { + if (!playingRef.current && !useLegacyPausedSeek && !isPreviewScrubbing) { + if (pausedRenderTargetKeyRef.current !== targetCacheKey) { + pausedRenderTargetKeyRef.current = targetCacheKey; + setPausedRenderTargetKey(targetCacheKey); + } pendingTimeRef.current = targetTime; if (decoderReadyRef.current) { pumpLatestDecodedFrame(); } + } else if (isPreviewScrubbing) { + pendingTimeRef.current = null; } const syncAudioTime = () => { @@ -376,7 +604,7 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { const canSeek = video.readyState >= 1; if (!canSeek) return; - if (!playingRef.current && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek) { + if (!playingRef.current && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek && !isPreviewScrubbing) { syncAudioTime(); return; } @@ -395,25 +623,67 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { return; } - try { - poolRef.current.seekClip(poolClipIdRef.current, frame / fps, { fast: true }); - } catch { - // Ignore seek errors while media is loading + if (isPreviewScrubbing) { + if (Math.abs(video.currentTime - targetTime) >= 0.016) { + try { + video.currentTime = targetTime; + } catch { + // Ignore seek errors while media is loading + } + } + } else { + try { + poolRef.current.seekClip(poolClipIdRef.current, frame / fps, { fast: true }); + } catch { + // Ignore seek errors while media is loading + } } syncAudioTime(); - }, [activeSrc, fps, hasDecodedFrame, pumpLatestDecodedFrame, src, strictDecodeReady, useLegacyPausedSeek]); + }, [ + activeSrc, + fps, + hasDecodedFrame, + isPreviewScrubbing, + pumpLatestDecodedFrame, + src, + strictDecodeReady, + useLegacyPausedSeek, + ]); useEffect(() => { - syncSourceFrame(clock.currentFrame); + syncSourceFrame(playing ? clock.currentFrame : getResolvedPausedSourceFrame()); return clock.onFrameChange((frame) => { + if (!playingRef.current) { + return; + } syncSourceFrame(frame); }); - }, [clock, syncSourceFrame]); + }, [clock, getResolvedPausedSourceFrame, playing, syncSourceFrame]); + + useEffect(() => { + syncSourceFrame(playing ? clock.currentFrame : getResolvedPausedSourceFrame()); + }, [clock, getResolvedPausedSourceFrame, playing, syncSourceFrame]); useEffect(() => { - syncSourceFrame(clock.currentFrame); - }, [clock, playing, syncSourceFrame]); + if (!followSourcePlayerFrames) { + return; + } + + return useSourcePlayerStore.subscribe((state, prevState) => { + if ( + playingRef.current + || ( + state.previewSourceFrame === prevState.previewSourceFrame + && state.currentSourceFrame === prevState.currentSourceFrame + ) + ) { + return; + } + + syncSourceFrame(state.previewSourceFrame ?? 
state.currentSourceFrame); + }); + }, [followSourcePlayerFrames, syncSourceFrame]); // Handle play/pause sync useEffect(() => { @@ -454,7 +724,15 @@ function VideoSource({ mediaId, src }: { mediaId?: string; src: string }) { } }, [playbackRate, playing, src]); - const showDecodedCanvas = !playing && strictDecodeReady && hasDecodedFrame && !useLegacyPausedSeek; + const showDecodedCanvas = ( + !playing + && !isPreviewScrubbing + && strictDecodeReady + && hasDecodedFrame + && !useLegacyPausedSeek + && decodedFrameKey !== null + && decodedFrameKey === pausedRenderTargetKey + ); return ( diff --git a/src/features/preview/components/source-monitor.test.tsx b/src/features/preview/components/source-monitor.test.tsx index 600821972..89fd5d587 100644 --- a/src/features/preview/components/source-monitor.test.tsx +++ b/src/features/preview/components/source-monitor.test.tsx @@ -1,6 +1,6 @@ import { StrictMode, type ReactNode } from 'react'; import { beforeAll, beforeEach, describe, expect, it, vi } from 'vitest'; -import { render, waitFor } from '@testing-library/react'; +import { fireEvent, render, waitFor } from '@testing-library/react'; const editorStoreState = vi.hoisted(() => ({ sourcePreviewMediaId: 'media-1' as string | null, @@ -11,6 +11,7 @@ const sourcePlayerStoreState = vi.hoisted(() => ({ playerMethods: null as unknown, currentMediaId: null as string | null, currentSourceFrame: 0, + previewSourceFrame: null as number | null, inPoint: null as number | null, outPoint: null as number | null, pendingSeekFrame: null as number | null, @@ -19,6 +20,7 @@ const sourcePlayerStoreState = vi.hoisted(() => ({ setCurrentMediaId: vi.fn(), releaseCurrentMediaId: vi.fn(), setCurrentSourceFrame: vi.fn(), + setPreviewSourceFrame: vi.fn(), setInPoint: vi.fn(), setOutPoint: vi.fn(), clearInOutPoints: vi.fn(), @@ -42,23 +44,31 @@ const itemsStoreState = vi.hoisted(() => ({ tracks: [], })); +const playerMethodsState = vi.hoisted(() => ({ + seek: vi.fn(), + play: vi.fn(), + pause: vi.fn(), + toggle: vi.fn(), + frameBack: vi.fn(), + frameForward: vi.fn(), +})); + +const clockState = vi.hoisted(() => ({ + currentFrame: 0, + isPlaying: false, +})); + vi.mock('@/features/preview/deps/player-context', () => ({ PlayerEmitterProvider: ({ children }: { children: ReactNode }) => <>{children}, ClockBridgeProvider: ({ children }: { children: ReactNode }) => <>{children}, VideoConfigProvider: ({ children }: { children: ReactNode }) => <>{children}, useClock: () => ({ - currentFrame: 0, - isPlaying: false, + currentFrame: clockState.currentFrame, + isPlaying: clockState.isPlaying, onFrameChange: () => () => {}, }), - useClockIsPlaying: () => false, - usePlayer: () => ({ - seek: vi.fn(), - play: vi.fn(), - toggle: vi.fn(), - frameBack: vi.fn(), - frameForward: vi.fn(), - }), + useClockIsPlaying: () => clockState.isPlaying, + usePlayer: () => playerMethodsState, })); vi.mock('./source-composition', () => ({ @@ -165,11 +175,19 @@ describe('SourceMonitor current media ownership', () => { } vi.stubGlobal('ResizeObserver', ResizeObserverMock); + vi.stubGlobal('requestAnimationFrame', (callback: FrameRequestCallback) => ( + window.setTimeout(() => callback(performance.now()), 0) + )); + vi.stubGlobal('cancelAnimationFrame', (handle: number) => { + window.clearTimeout(handle); + }); }); beforeEach(() => { vi.clearAllMocks(); editorStoreState.sourcePreviewMediaId = 'media-1'; + clockState.currentFrame = 0; + clockState.isPlaying = false; }); it('does not release the current media during the initial Strict Mode remount', async () => { 
@@ -199,4 +217,64 @@ describe('SourceMonitor current media ownership', () => { expect(sourcePlayerStoreState.releaseCurrentMediaId).toHaveBeenCalledWith('media-1'); }); + it('batches seek bar drags and commits the final frame on mouseup', async () => { + const rendered = render(); + + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentMediaId).toHaveBeenCalledWith('media-1'); + }); + + const seekBar = rendered.getByTestId('source-monitor-seek-bar'); + vi.spyOn(seekBar, 'getBoundingClientRect').mockReturnValue({ + x: 0, + y: 0, + top: 0, + left: 0, + right: 100, + bottom: 10, + width: 100, + height: 10, + toJSON: () => ({}), + }); + + fireEvent.mouseDown(seekBar, { clientX: 25 }); + fireEvent.mouseMove(document, { clientX: 75 }); + + expect(playerMethodsState.seek).not.toHaveBeenCalled(); + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentSourceFrame).toHaveBeenLastCalledWith(112); + }); + + fireEvent.mouseUp(document); + + expect(playerMethodsState.seek).toHaveBeenCalledTimes(1); + expect(playerMethodsState.seek).toHaveBeenCalledWith(112); + }); + + it('pauses playback when seek-bar scrubbing starts', async () => { + clockState.isPlaying = true; + const rendered = render(); + + await waitFor(() => { + expect(sourcePlayerStoreState.setCurrentMediaId).toHaveBeenCalledWith('media-1'); + }); + + const seekBar = rendered.getByTestId('source-monitor-seek-bar'); + vi.spyOn(seekBar, 'getBoundingClientRect').mockReturnValue({ + x: 0, + y: 0, + top: 0, + left: 0, + right: 100, + bottom: 10, + width: 100, + height: 10, + toJSON: () => ({}), + }); + + fireEvent.mouseDown(seekBar, { clientX: 25 }); + + expect(playerMethodsState.pause).toHaveBeenCalledTimes(1); + }); + }); diff --git a/src/features/preview/components/source-monitor.tsx b/src/features/preview/components/source-monitor.tsx index 7184e6e3f..8a772345c 100644 --- a/src/features/preview/components/source-monitor.tsx +++ b/src/features/preview/components/source-monitor.tsx @@ -41,6 +41,7 @@ import { useEditorStore } from '@/app/state/editor'; import { useSourcePlayerStore } from '@/shared/state/source-player'; import { useSelectionStore } from '@/shared/state/selection'; import { EDITOR_LAYOUT_CSS_VALUES, getEditorLayout } from '@/app/editor-layout'; +import { createScrubThrottleState, shouldCommitScrubFrame } from '../deps/timeline-utils'; import { cn } from '@/shared/ui/cn'; import { formatTimecodeCompact } from '@/shared/utils/time-utils'; import type { TimelineTrack } from '@/types/timeline'; @@ -559,6 +560,12 @@ function SourcePlaybackControls({ [fps, formatFrameNumber], ); + const clearPreviewSourceFrame = useCallback(() => { + if (interactive) { + useSourcePlayerStore.getState().setPreviewSourceFrame(null); + } + }, [interactive]); + const updateFrameDisplay = useCallback((frame: number) => { currentFrameRef.current = frame; if (interactive) { @@ -575,21 +582,44 @@ function SourcePlaybackControls({ } }, [fps, formatFrameNumber, interactive, lastFrame]); + const commitSourceSeek = useCallback((frame: number) => { + clearPreviewSourceFrame(); + updateFrameDisplay(frame); + player.seek(frame); + }, [clearPreviewSourceFrame, player, updateFrameDisplay]); + // Bridge player methods into the source player store for keyboard shortcuts useEffect(() => { if (!interactive) return; const setPlayerMethods = useSourcePlayerStore.getState().setPlayerMethods; setPlayerMethods({ - toggle: player.toggle, - seek: player.seek, - frameBack: player.frameBack, - frameForward: player.frameForward, + toggle: () => { + const 
previewFrame = useSourcePlayerStore.getState().previewSourceFrame; + if (previewFrame !== null) { + commitSourceSeek(previewFrame); + } + player.toggle(); + }, + pause: () => { + player.pause(); + }, + seek: (frame) => { + commitSourceSeek(frame); + }, + frameBack: (frames) => { + clearPreviewSourceFrame(); + player.frameBack(frames); + }, + frameForward: (frames) => { + clearPreviewSourceFrame(); + player.frameForward(frames); + }, getDurationInFrames: () => durationInFrames, }); return () => { useSourcePlayerStore.getState().setPlayerMethods(null); }; - }, [durationInFrames, interactive, player.toggle, player.seek, player.frameBack, player.frameForward]); + }, [clearPreviewSourceFrame, commitSourceSeek, durationInFrames, interactive, player]); useEffect(() => { updateFrameDisplay(clock.currentFrame); @@ -603,20 +633,29 @@ function SourcePlaybackControls({ }); }, [clock, player, updateFrameDisplay]); - // Consume pending seek (e.g. double-click opens clip at its In point) + // Consume pending seek. Always pause → seek → (optionally) play so + // switching scenes lands a clean transition: no `player.play()` short- + // circuiting because the previous scene was still playing (the ref + // `imperativePlaying.current` blocks a second play), and the video + // element isn't decoding the old frame while the seek is in flight. const pendingSeekFrame = useSourcePlayerStore((s) => s.pendingSeekFrame); useEffect(() => { if (!interactive) return; if (pendingSeekFrame !== null) { - player.seek(pendingSeekFrame); - useSourcePlayerStore.getState().setPendingSeekFrame(null); + player.pause(); + commitSourceSeek(pendingSeekFrame); + const store = useSourcePlayerStore.getState(); + store.setPendingSeekFrame(null); + const shouldPlay = store.pendingPlay; + store.setPendingPlay(false); + if (shouldPlay) player.play(); } - }, [interactive, pendingSeekFrame, player]); + }, [commitSourceSeek, interactive, pendingSeekFrame, player]); useEffect(() => { if (seekFrame === null) return; - player.seek(seekFrame); - }, [player, seekFrame]); + commitSourceSeek(seekFrame); + }, [commitSourceSeek, seekFrame]); // Read I/O points from store const inPoint = useSourcePlayerStore((s) => s.inPoint); @@ -630,29 +669,124 @@ function SourcePlaybackControls({ const draggingRef = useRef(false); const onMoveRef = useRef<((ev: MouseEvent) => void) | null>(null); const onUpRef = useRef<(() => void) | null>(null); - - const seekFromX = useCallback( + const pendingBarSeekFrameRef = useRef(null); + const pendingBarPointerXRef = useRef(null); + const barSeekRafRef = useRef(null); + const lastIssuedBarSeekFrameRef = useRef(null); + const scrubThrottleStateRef = useRef(createScrubThrottleState({ + frame: clock.currentFrame, + nowMs: performance.now(), + })); + + const frameFromBarX = useCallback( (clientX: number) => { const bar = barRef.current; - if (!bar) return; + if (!bar) return null; const rect = bar.getBoundingClientRect(); + if (rect.width <= 0) { + return 0; + } const pct = Math.max(0, Math.min(1, (clientX - rect.left) / rect.width)); - player.seek(Math.round(pct * lastFrame)); + return Math.round(pct * lastFrame); }, - [player, lastFrame], + [lastFrame], ); + const flushBarSeekFrame = useCallback((frame: number) => { + lastIssuedBarSeekFrameRef.current = frame; + commitSourceSeek(frame); + }, [commitSourceSeek]); + + const getBarPixelsPerSecond = useCallback(() => { + const bar = barRef.current; + if (!bar || durationInFrames <= 0 || fps <= 0) { + return 0; + } + + return (bar.clientWidth * fps) / durationInFrames; + }, 
[durationInFrames, fps]); + + const previewBarSeekFrame = useCallback((frame: number) => { + pendingBarSeekFrameRef.current = frame; + + if (interactive) { + useSourcePlayerStore.getState().setPreviewSourceFrame(frame); + } + if (currentFrameRef.current !== frame) { + updateFrameDisplay(frame); + } + }, [interactive, updateFrameDisplay]); + + const scheduleBarSeekFrame = useCallback((frame: number, pointerX: number, force = false) => { + pendingBarSeekFrameRef.current = frame; + pendingBarPointerXRef.current = pointerX; + + if (force) { + previewBarSeekFrame(frame); + return; + } + + if (barSeekRafRef.current !== null) { + return; + } + + barSeekRafRef.current = requestAnimationFrame(() => { + barSeekRafRef.current = null; + const pendingFrame = pendingBarSeekFrameRef.current; + const pendingPointerX = pendingBarPointerXRef.current; + if (pendingFrame === null || pendingPointerX === null) { + return; + } + + if (shouldCommitScrubFrame({ + state: scrubThrottleStateRef.current, + pointerX: pendingPointerX, + targetFrame: pendingFrame, + pixelsPerSecond: getBarPixelsPerSecond(), + nowMs: performance.now(), + })) { + previewBarSeekFrame(pendingFrame); + } + }); + }, [getBarPixelsPerSecond, previewBarSeekFrame]); + const handleBarMouseDown = useCallback( (e: React.MouseEvent) => { e.preventDefault(); e.stopPropagation(); draggingRef.current = true; - seekFromX(e.clientX); + if (playing) { + player.pause(); + replayingRef.current = false; + } + const initialFrame = frameFromBarX(e.clientX); + if (initialFrame !== null) { + scrubThrottleStateRef.current = createScrubThrottleState({ + pointerX: e.clientX, + frame: initialFrame, + nowMs: performance.now(), + }); + scheduleBarSeekFrame(initialFrame, e.clientX, true); + } const onMove = (ev: MouseEvent) => { - if (draggingRef.current) seekFromX(ev.clientX); + if (!draggingRef.current) return; + const nextFrame = frameFromBarX(ev.clientX); + if (nextFrame !== null) { + scheduleBarSeekFrame(nextFrame, ev.clientX); + } }; const onUp = () => { + const pendingFrame = pendingBarSeekFrameRef.current; draggingRef.current = false; + pendingBarSeekFrameRef.current = null; + pendingBarPointerXRef.current = null; + if (barSeekRafRef.current !== null) { + cancelAnimationFrame(barSeekRafRef.current); + barSeekRafRef.current = null; + } + if (pendingFrame !== null) { + flushBarSeekFrame(pendingFrame); + } if (onMoveRef.current) { document.removeEventListener('mousemove', onMoveRef.current); onMoveRef.current = null; @@ -667,12 +801,19 @@ function SourcePlaybackControls({ document.addEventListener('mousemove', onMove); document.addEventListener('mouseup', onUp); }, - [seekFromX], + [flushBarSeekFrame, frameFromBarX, player, playing, scheduleBarSeekFrame], ); // Clean up document listeners on unmount useEffect(() => { return () => { + pendingBarSeekFrameRef.current = null; + pendingBarPointerXRef.current = null; + lastIssuedBarSeekFrameRef.current = null; + if (barSeekRafRef.current !== null) { + cancelAnimationFrame(barSeekRafRef.current); + barSeekRafRef.current = null; + } if (onMoveRef.current) { document.removeEventListener('mousemove', onMoveRef.current); onMoveRef.current = null; @@ -803,9 +944,35 @@ function SourcePlaybackControls({ const { inPoint: ip, outPoint: op } = useSourcePlayerStore.getState(); if (ip === null && op === null) return; replayingRef.current = true; - player.seek(ip ?? 0); + commitSourceSeek(ip ?? 
0); player.play(); - }, [player]); + }, [commitSourceSeek, player]); + + const handleGoToStart = useCallback(() => { + commitSourceSeek(0); + }, [commitSourceSeek]); + + const handleStepBack = useCallback(() => { + clearPreviewSourceFrame(); + player.frameBack(1); + }, [clearPreviewSourceFrame, player]); + + const handleTogglePlayback = useCallback(() => { + const previewFrame = useSourcePlayerStore.getState().previewSourceFrame; + if (previewFrame !== null) { + commitSourceSeek(previewFrame); + } + player.toggle(); + }, [commitSourceSeek, player]); + + const handleStepForward = useCallback(() => { + clearPreviewSourceFrame(); + player.frameForward(1); + }, [clearPreviewSourceFrame, player]); + + const handleGoToEnd = useCallback(() => { + commitSourceSeek(lastFrame); + }, [commitSourceSeek, lastFrame]); const activeTrack = useMemo( () => (activeTrackId ? tracks.find((track) => track.id === activeTrackId) ?? null : null), @@ -1021,6 +1188,7 @@ function SourcePlaybackControls({ {/* Seek bar */}
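All five transport handlers above route through the same preview/commit split: scrub moves only update a preview frame, a commit is the single place a real `player.seek` is issued, and anything that resumes playback or steps frames clears the preview first. A standalone model of that flow, with an illustrative `Player` type and controller shape rather than the component's actual API:

```ts
type Player = { seek: (frame: number) => void };

// Minimal sketch of the preview/commit split, assuming a bare seek-only
// player; the component realizes the same flow via previewBarSeekFrame,
// commitSourceSeek, and clearPreviewSourceFrame.
function makeScrubController(player: Player) {
  let previewFrame: number | null = null;
  return {
    // mousemove: cheap display/store update only, no decoder work.
    preview(frame: number) { previewFrame = frame; },
    // play / frame-step: a stale preview must not outlive scrubbing.
    clear() { previewFrame = null; },
    // mouseup: the one place a real seek is issued.
    commit() {
      if (previewFrame !== null) player.seek(previewFrame);
      previewFrame = null;
    },
  };
}
```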
@@ -1098,7 +1266,7 @@ function SourcePlaybackControls({
- @@ -1106,7 +1274,7 @@ function SourcePlaybackControls({ - @@ -1114,7 +1282,7 @@ function SourcePlaybackControls({ - @@ -1122,7 +1290,7 @@ function SourcePlaybackControls({ - @@ -1130,7 +1298,7 @@ function SourcePlaybackControls({ - diff --git a/src/features/preview/components/timecode-display.test.tsx b/src/features/preview/components/timecode-display.test.tsx index bf3161621..c1fcaab4e 100644 --- a/src/features/preview/components/timecode-display.test.tsx +++ b/src/features/preview/components/timecode-display.test.tsx @@ -54,4 +54,30 @@ describe('TimecodeDisplay', () => { expect(button).toHaveTextContent('0012'); expect(button).toHaveTextContent('0999'); }); + + it('shows the skim preview frame in the timecode readout', () => { + render(); + + const button = screen.getByRole('button'); + expect(button).toHaveTextContent('00:00:12'); + + usePlaybackStore.getState().setPreviewFrame(48); + + expect(button).toHaveTextContent('00:01:18'); + }); + + it('prefers the displayed overlay frame when fast scrub owns presentation', () => { + render(); + + const button = screen.getByRole('button'); + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 1, + previewFrame: 48, + previewFrameEpoch: 2, + }); + usePreviewBridgeStore.getState().setDisplayedFrame(50); + + expect(button).toHaveTextContent('00:01:20'); + }); }); diff --git a/src/features/preview/components/timecode-display.tsx b/src/features/preview/components/timecode-display.tsx index cd387be52..8e0c66df8 100644 --- a/src/features/preview/components/timecode-display.tsx +++ b/src/features/preview/components/timecode-display.tsx @@ -1,5 +1,6 @@ import { useState, useEffect, useRef, useCallback } from 'react'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; +import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { formatTimecodeCompact } from '@/shared/utils/time-utils'; interface TimecodeDisplayProps { @@ -38,7 +39,20 @@ export function TimecodeDisplay({ fps, totalFrames }: TimecodeDisplayProps) { return frame.toString().padStart(maxDigits, '0'); }, []); - // Subscribe to currentFrame changes and update DOM directly (no React re-renders) + const getVisibleFrame = useCallback(() => { + const playbackState = usePlaybackStore.getState(); + return getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); + }, []); + + // Subscribe to the resolved visible preview frame and update DOM directly + // (no React re-renders during playback/scrub). 
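The two new tests above pin down how the timecode readout arbitrates its inputs; a reconstruction covering just those tested cases follows. The real `getResolvedPlaybackFrame` in `shared/state/playback` is not part of this diff and may weigh more inputs:

```ts
// Sketch consistent with the tests: a newer preview frame wins while
// paused, and the frame the preview bridge actually drew is preferred
// over the frame that was merely requested.
interface ResolvedFrameInput {
  currentFrame: number;
  currentFrameEpoch: number;
  previewFrame: number | null;
  previewFrameEpoch: number;
  isPlaying: boolean;
  displayedFrame: number | null;
}

function resolveVisibleFrameSketch(input: ResolvedFrameInput): number {
  if (
    input.previewFrame !== null
    && input.previewFrameEpoch > input.currentFrameEpoch
    && !input.isPlaying
  ) {
    return input.displayedFrame ?? input.previewFrame;
  }
  return input.currentFrame;
}
```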
useEffect(() => { const updateDisplay = (frame: number) => { if (!currentTimeRef.current) return; @@ -48,22 +62,29 @@ export function TimecodeDisplay({ fps, totalFrames }: TimecodeDisplayProps) { }; // Initial update - updateDisplay(usePlaybackStore.getState().currentFrame); + updateDisplay(getVisibleFrame()); - // Subscribe to store changes - return usePlaybackStore.subscribe((state) => { - updateDisplay(state.currentFrame); - }); - }, [formatFrameNumber]); + const syncDisplay = () => { + updateDisplay(getVisibleFrame()); + }; + + const unsubscribePlayback = usePlaybackStore.subscribe(syncDisplay); + const unsubscribePreviewBridge = usePreviewBridgeStore.subscribe(syncDisplay); + + return () => { + unsubscribePlayback(); + unsubscribePreviewBridge(); + }; + }, [formatFrameNumber, getVisibleFrame]); // Update display when showFrames or fps changes (rare - can trigger re-render) useEffect(() => { if (!currentTimeRef.current) return; - const frame = usePlaybackStore.getState().currentFrame; + const frame = getVisibleFrame(); currentTimeRef.current.textContent = showFrames ? formatFrameNumber(frame) : formatTimecodeCompact(frame, fps); - }, [showFrames, fps, formatFrameNumber]); + }, [showFrames, fps, formatFrameNumber, getVisibleFrame]); return ( + + +
+ More colors + {hidden.length} +
+
+ {hidden.map((cluster, i) => ( + + ))} +
+
+ + )} +
+ ); +}); + +function SwatchButton({ + cluster, + totalWeight, + active, + onPick, +}: { + cluster: Cluster; + totalWeight: number; + active: boolean; + onPick: (cluster: Cluster) => void; +}) { + const [r, g, b] = labToRgb(cluster.l, cluster.a, cluster.b); + const share = cluster.weight / totalWeight; + const label = `Find scenes in this color (${Math.round(share * 100)}% of the library)`; + return ( +
+
+ +
+
+ +
+ + + {isFiltered + ? `${scenes.length} ${scenes.length === 1 ? 'match' : 'matches'} · ${scopeLabel}` + : scopeLabel} + +
+ + Sort + +
+
+ + {/* The `[&>...]` override forces Radix's inner viewport wrapper to + block layout. Radix defaults it to `display: table; min-width: 100%` + which lets the wrapper grow past the viewport width if any row has + a long intrinsic min-width — that overflow slides row content + underneath the vertical scrollbar. Block layout keeps the wrapper + clamped to viewport width so the scrollbar sits in its own column. */} + +
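The class string itself is not shown in this hunk; the override the comment describes is an arbitrary variant targeting Radix's viewport wrapper. An illustrative sketch, assuming the shadcn/ui `ScrollArea` wrapper and Radix's `data-radix-scroll-area-viewport` attribute (the exact selector used in the panel may differ):

```tsx
import type { ReactNode } from 'react';
import { ScrollArea } from '@/components/ui/scroll-area';

// Illustrative only: forces the viewport's inner wrapper from Radix's
// default `display: table; min-width: 100%` to block layout so a long
// row cannot widen it past the viewport.
export function ResultsScroller({ children }: { children: ReactNode }) {
  return (
    <ScrollArea className="h-full [&>[data-radix-scroll-area-viewport]>div]:!block [&>[data-radix-scroll-area-viewport]>div]:!min-w-0">
      {children}
    </ScrollArea>
  );
}
```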
+ {reanalyzingMedia.length > 0 && ( + + )} + {(indexProgress.indexTotal > 0 || indexProgress.loadingModel) && ( + + )} + {hasResults ? ( + viewMode === 'grid' ? ( +
+ {scenes.map((scene, index) => ( + + ))} +
+ ) : ( + scenes.map((scene, index) => ( + + )) + ) + ) : reanalyzingMedia.length === 0 ? ( + + ) : null} +
+
+
+ ); +} + +function SemanticIndexBanner({ + progress, +}: { + progress: { indexing: number; indexTotal: number; loadingModel: boolean }; +}) { + const label = progress.loadingModel + ? 'Downloading semantic model (~22 MB, first run only)…' + : `Indexing captions for semantic search — ${progress.indexing}/${progress.indexTotal} clips`; + return ( +
+ + {label} +
+ ); +} + +function ReanalyzingBanner({ + items, +}: { + items: Array<{ id: string; fileName: string }>; +}) { + const label = items.length === 1 + ? items[0]!.fileName + : `${items.length} clips`; + return ( +
+ + + Re-analyzing {label} — scenes will refresh when done. + +
+ ); +} + +/** + * Compact replacement for the scope ` setQuery(e.target.value)} + placeholder={ + reference + ? 'Finding scenes with a similar palette…' + : semanticActive + ? 'Search by meaning — "sunset over water", "people laughing"…' + : 'Search scenes by what you see…' + } + disabled={!!reference} + className="h-8 pl-8 pr-7 text-[12px] disabled:opacity-60" + spellCheck={false} + autoComplete="off" + /> + {query.length > 0 && !reference && ( + + )} +
+ {reference && ( + + )} +
+ ); +} diff --git a/src/features/scene-browser/deps/analysis-contract.ts b/src/features/scene-browser/deps/analysis-contract.ts new file mode 100644 index 000000000..9a5d26923 --- /dev/null +++ b/src/features/scene-browser/deps/analysis-contract.ts @@ -0,0 +1,27 @@ +/** + * Cross-feature contract — scene-browser uses the embeddings provider for + * semantic search (query embedding + background indexer). + */ + +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + extractDominantColors, + extractDominantColorPhrase, + deltaE2000, + rgbToLab, +} from '@/infrastructure/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from '@/infrastructure/analysis'; diff --git a/src/features/scene-browser/deps/analysis.ts b/src/features/scene-browser/deps/analysis.ts new file mode 100644 index 000000000..6195064ad --- /dev/null +++ b/src/features/scene-browser/deps/analysis.ts @@ -0,0 +1 @@ +export * from './analysis-contract'; diff --git a/src/features/scene-browser/deps/media-library-contract.ts b/src/features/scene-browser/deps/media-library-contract.ts new file mode 100644 index 000000000..e62dc9d53 --- /dev/null +++ b/src/features/scene-browser/deps/media-library-contract.ts @@ -0,0 +1,10 @@ +/** + * Cross-feature adapter contract — scene-browser accesses media-library + * state and helpers through this file so the import graph is auditable. + */ + +export { useMediaLibraryStore } from '@/features/media-library/stores/media-library-store'; +export { getMediaType, formatDuration } from '@/features/media-library/utils/validation'; +export { mediaLibraryService } from '@/features/media-library/services/media-library-service'; +export { mediaAnalysisService } from '@/features/media-library/services/media-analysis-service'; +export type { MediaLibraryNotification } from '@/features/media-library/types'; diff --git a/src/features/scene-browser/deps/media-library.ts b/src/features/scene-browser/deps/media-library.ts new file mode 100644 index 000000000..9a640311e --- /dev/null +++ b/src/features/scene-browser/deps/media-library.ts @@ -0,0 +1,10 @@ +/** + * Cross-feature adapter — scene-browser accesses media-library state and + * the shared source player through this barrel so the import graph stays + * one-directional (feature-boundary rule in CLAUDE.md). + */ + +export * from './media-library-contract'; +export { useSourcePlayerStore } from '@/shared/state/source-player'; +export { useEditorStore } from '@/app/state/editor'; +export type { MediaMetadata } from '@/types/storage'; diff --git a/src/features/scene-browser/deps/settings-contract.ts b/src/features/scene-browser/deps/settings-contract.ts new file mode 100644 index 000000000..d212c2e36 --- /dev/null +++ b/src/features/scene-browser/deps/settings-contract.ts @@ -0,0 +1,7 @@ +/** + * Adapter — scene-browser reads `captionSearchMode` from the app settings + * store through this contract so the boundary checker stays happy. 
+ */ + +export { useSettingsStore } from '@/features/settings/stores/settings-store'; +export type { CaptionSearchMode } from '@/features/settings/stores/settings-store'; diff --git a/src/features/scene-browser/deps/settings.ts b/src/features/scene-browser/deps/settings.ts new file mode 100644 index 000000000..5322f23d4 --- /dev/null +++ b/src/features/scene-browser/deps/settings.ts @@ -0,0 +1 @@ +export * from './settings-contract'; diff --git a/src/features/scene-browser/deps/storage.ts b/src/features/scene-browser/deps/storage.ts new file mode 100644 index 000000000..f014cadf3 --- /dev/null +++ b/src/features/scene-browser/deps/storage.ts @@ -0,0 +1,17 @@ +/** + * Storage adapter — loads caption thumbnail blobs from workspace-fs. + */ + +export { + getCaptionThumbnailBlob, + saveCaptionThumbnail, + probeCaptionThumbnail, + saveCaptionEmbeddings, + getCaptionEmbeddings, + getCaptionsEmbeddingsMeta, + saveCaptionImageEmbeddings, + getCaptionImageEmbeddings, + getTranscript, + getScenes, +} from '@/infrastructure/storage'; +export type { SavedScenes } from '@/infrastructure/storage'; diff --git a/src/features/scene-browser/hooks/use-caption-thumbnail.ts b/src/features/scene-browser/hooks/use-caption-thumbnail.ts new file mode 100644 index 000000000..3ce749912 --- /dev/null +++ b/src/features/scene-browser/hooks/use-caption-thumbnail.ts @@ -0,0 +1,112 @@ +import { useEffect, useRef, useState } from 'react'; +import { getCaptionThumbnailBlob } from '../deps/storage'; +import { requestLazyCaptionThumbnail } from '../utils/lazy-thumb'; + +/** + * Module-scoped blob URL cache keyed by `thumbRelPath`. Scene Browser rows + * are virtualized / remount frequently, so loading the same JPEG for every + * mount would thrash the workspace-fs read path. Entries are evicted by + * {@link invalidateMediaCaptionThumbBlobs} when the source media is + * re-analyzed — without that, a blob URL keeps pointing at the pre-reanalyze + * JPEG content even after the on-disk file changes. + */ +const blobUrlCache = new Map<string, string>(); +const pendingLoads = new Map<string, Promise<string | null>>(); + +/** + * Revoke and drop every blob URL that lives under a media's + * captions-thumbs directory. Callers should invoke this before a + * re-analysis run so the next render loads the freshly-written JPEG + * instead of the cached pre-overwrite blob. + */ +export function invalidateMediaCaptionThumbBlobs(mediaId: string): void { + const prefix = `media/${mediaId}/cache/ai/captions-thumbs/`; + for (const [key, url] of blobUrlCache) { + if (key.startsWith(prefix)) { + URL.revokeObjectURL(url); + blobUrlCache.delete(key); + } + } + for (const key of pendingLoads.keys()) { + if (key.startsWith(prefix)) pendingLoads.delete(key); + } +} + +async function loadBlobUrl(relPath: string): Promise<string | null> { + const cached = blobUrlCache.get(relPath); + if (cached) return cached; + const pending = pendingLoads.get(relPath); + if (pending) return pending; + + const promise = (async () => { + const blob = await getCaptionThumbnailBlob(relPath); + if (!blob) return null; + const url = URL.createObjectURL(blob); + blobUrlCache.set(relPath, url); + return url; + })(); + pendingLoads.set(relPath, promise); + try { + return await promise; + } finally { + pendingLoads.delete(relPath); + } +} + +interface LazyRequest { + mediaId: string; + captionIndex: number; + timeSec: number; +} + +/** + * Resolve a caption thumbnail `thumbRelPath` to a blob URL.
When the + * persisted path is missing and a `lazy` descriptor is supplied, the + * generator is queued to seek the source media, capture a JPEG, persist + * it, and hand the resulting path back to this hook on a subsequent + * render (via the store patch inside `lazy-thumb.ts`). + */ +export function useCaptionThumbnail( + thumbRelPath: string | undefined, + lazy?: LazyRequest, +): string | null { + const [url, setUrl] = useState(() => ( + thumbRelPath ? blobUrlCache.get(thumbRelPath) ?? null : null + )); + const latestPath = useRef(thumbRelPath); + latestPath.current = thumbRelPath; + + useEffect(() => { + if (thumbRelPath) { + const cached = blobUrlCache.get(thumbRelPath); + if (cached) { + setUrl(cached); + return; + } + setUrl(null); + void loadBlobUrl(thumbRelPath).then((loaded) => { + if (latestPath.current === thumbRelPath) { + setUrl(loaded); + } + }); + return; + } + + // No persisted thumbnail — lazy-generate if we know how. + setUrl(null); + if (!lazy) return; + let cancelled = false; + void requestLazyCaptionThumbnail(lazy.mediaId, lazy.captionIndex, lazy.timeSec) + .then((relPath) => { + if (cancelled || !relPath) return; + void loadBlobUrl(relPath).then((loaded) => { + if (!cancelled && latestPath.current === undefined) { + setUrl(loaded); + } + }); + }); + return () => { cancelled = true; }; + }, [thumbRelPath, lazy?.mediaId, lazy?.captionIndex, lazy?.timeSec]); + + return url; +} diff --git a/src/features/scene-browser/hooks/use-library-palette.ts b/src/features/scene-browser/hooks/use-library-palette.ts new file mode 100644 index 000000000..12cb3fc22 --- /dev/null +++ b/src/features/scene-browser/hooks/use-library-palette.ts @@ -0,0 +1,56 @@ +import { useMemo } from 'react'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { + clusterPaletteEntries, + flattenLibraryPalettes, + type LabCluster, +} from '../utils/library-palette'; + +/** Target cluster count. Capped further by how many palettes exist. */ +const DEFAULT_K = 12; + +/** + * Collect every caption's palette across the library and cluster them + * into a small set of representative colors for the Color Mode picker. + * + * The hook reads from the media-library store (not the scene browser's + * embeddings cache) because captions are the source of truth — the + * palettes in `MediaCaption.palette` are what the ranker matches + * against, so the grid must reflect the same data. + */ +export function useLibraryPalette( + scope: string | null, + k = DEFAULT_K, +): LabCluster[] { + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + + return useMemo(() => { + const palettes: Array> = []; + for (const media of mediaItems) { + if (scope && media.id !== scope) continue; + const captions = media.aiCaptions; + if (!captions || captions.length === 0) continue; + for (const caption of captions) { + if (caption.palette && caption.palette.length > 0) { + palettes.push(caption.palette); + } + } + } + if (palettes.length === 0) return []; + + const flat = flattenLibraryPalettes(palettes.map((p) => p.map((e) => ({ + l: e.l, a: e.a, b: e.b, weight: e.weight, + })))); + const clusters = clusterPaletteEntries(flat, k); + + // Sort by aggregate weight so the grid leads with the library's + // dominant colors — skin, sky, greenery tend to surface first, + // with vivid accents trailing. Stable tiebreak on Lab so the order + // doesn't jitter across renders. 
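`clusterPaletteEntries` (invoked a few lines up) is not part of this diff; what the hook relies on is some weighted clustering over Lab space, along the lines of this sketch. The function name, shape, and the plain k-means choice are assumptions about `utils/library-palette`:

```ts
interface WeightedLab { l: number; a: number; b: number; weight: number }

// Plain weighted k-means over Lab coordinates: each palette entry pulls
// its nearest centroid proportionally to its weight, so colors that
// dominate thumbnails dominate the resulting swatch grid.
function kMeansLab(entries: WeightedLab[], k: number, iters = 10): WeightedLab[] {
  let centroids = entries.slice(0, Math.min(k, entries.length)).map((e) => ({ ...e }));
  for (let iter = 0; iter < iters; iter++) {
    const sums = centroids.map(() => ({ l: 0, a: 0, b: 0, weight: 0 }));
    entries.forEach((e) => {
      let best = 0;
      let bestDist = Number.POSITIVE_INFINITY;
      centroids.forEach((c, i) => {
        const d = (c.l - e.l) ** 2 + (c.a - e.a) ** 2 + (c.b - e.b) ** 2;
        if (d < bestDist) { bestDist = d; best = i; }
      });
      const s = sums[best];
      s.l += e.l * e.weight;
      s.a += e.a * e.weight;
      s.b += e.b * e.weight;
      s.weight += e.weight;
    });
    centroids = sums.map((s, i) => (s.weight > 0
      ? { l: s.l / s.weight, a: s.a / s.weight, b: s.b / s.weight, weight: s.weight }
      : centroids[i]));
  }
  return centroids;
}
```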
+ return clusters.slice().sort((a, b) => { + if (b.weight !== a.weight) return b.weight - a.weight; + if (a.l !== b.l) return a.l - b.l; + return a.a - b.a; + }); + }, [mediaItems, scope, k]); +} diff --git a/src/features/scene-browser/hooks/use-ranked-scenes.ts b/src/features/scene-browser/hooks/use-ranked-scenes.ts new file mode 100644 index 000000000..bc6332fb4 --- /dev/null +++ b/src/features/scene-browser/hooks/use-ranked-scenes.ts @@ -0,0 +1,265 @@ +import { useEffect, useMemo, useState } from 'react'; +import { createLogger } from '@/shared/logging/logger'; +import { clipProvider, embeddingsProvider } from '../deps/analysis'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { useSettingsStore } from '../deps/settings'; +import { useSceneBrowserStore } from '../stores/scene-browser-store'; +import { + getEmbeddingsSnapshot, + getImageEmbeddingsSnapshot, + getPalettesSnapshot, +} from '../utils/embeddings-cache'; +import { parseColorQuery } from '../utils/color-boost'; +import { rankScenes, type RankableScene, type ScoredScene } from '../utils/rank'; +import { semanticRank } from '../utils/semantic-rank'; + +const log = createLogger('SceneBrowser:RankedScenes'); + +export interface RankedScenesResult { + scenes: ScoredScene[]; + totalScenes: number; + totalClips: number; + clipsWithCaptions: number; + /** + * Filenames of media currently being Analyzed-with-AI (and therefore + * excluded from the scene list above). Exposed so the panel can surface + * a "re-analyzing" indicator while the old entries are hidden and the + * new ones haven't landed yet. + */ + reanalyzingMedia: Array<{ id: string; fileName: string }>; + /** + * Whether the active search mode produced the shown ranking. Semantic + * mode falls back to keyword while the query embedding is in flight — + * the panel can use this to show a subtle "embedding…" indicator. + */ + activeMode: 'keyword' | 'semantic'; + /** + * True when a non-empty query is being ranked — toggles per-row score + * chrome so browsing without a query doesn't look cluttered with 0% + * badges on every scene. + */ + isQuerying: boolean; + /** + * Count of scenes (not clips) whose text embedding is currently loaded + * in memory, vs. the total visible scene count. Gives the status-bar + * something concrete to say while the background indexer is still + * filling things in. + */ + sceneTextIndexed: number; + /** Same, for CLIP image embeddings. */ + sceneImageIndexed: number; + /** + * True while we're waiting on the query's semantic text embedding — + * old scenes still render via keyword fallback, but the panel can show + * a subtle "embedding query…" pill so the delay isn't mysterious. + */ + queryTextEmbedding: 'idle' | 'embedding' | 'ready'; + /** Same, for the CLIP text-encoder half of the query. */ + queryImageEmbedding: 'idle' | 'embedding' | 'ready'; +} + +/** + * Build the ranked scene list for the Scene Browser. The hook owns all + * joining between media metadata and caption records so components can + * treat each row as a self-contained record (filename, timestamp, thumb path). 
+ */ +export function useRankedScenes(): RankedScenesResult { + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + const taggingMediaIds = useMediaLibraryStore((s) => s.taggingMediaIds); + const query = useSceneBrowserStore((s) => s.query); + const scope = useSceneBrowserStore((s) => s.scope); + const sortMode = useSceneBrowserStore((s) => s.sortMode); + const reference = useSceneBrowserStore((s) => s.reference); + const captionSearchMode = useSettingsStore((s) => s.captionSearchMode); + const colorQuery = useMemo(() => parseColorQuery(query), [query]); + + // Embed the query with both text models when semantic mode is active. + // Keeping each in a separate state slot (rather than a Suspense promise + // or sync read) means typing stays fluid — old scenes remain visible + // while the new embedding is in flight. + const [queryEmbedding, setQueryEmbedding] = useState(null); + const [queryImageEmbedding, setQueryImageEmbedding] = useState(null); + const [queryTextState, setQueryTextState] = useState<'idle' | 'embedding' | 'ready'>('idle'); + const [queryImageState, setQueryImageState] = useState<'idle' | 'embedding' | 'ready'>('idle'); + + useEffect(() => { + if (captionSearchMode !== 'semantic' || query.trim().length === 0) { + setQueryEmbedding(null); + setQueryTextState('idle'); + return; + } + if (colorQuery.paletteOnly) { + setQueryEmbedding(new Float32Array(0)); + setQueryTextState('ready'); + return; + } + let cancelled = false; + setQueryTextState('embedding'); + void embeddingsProvider + .embed(query.trim()) + .then((vector) => { + if (cancelled) return; + setQueryEmbedding(vector); + setQueryTextState('ready'); + }) + .catch((error) => { + if (!cancelled) { + log.warn('Query text embedding failed — falling back to keyword', { query, error }); + setQueryEmbedding(null); + setQueryTextState('idle'); + } + }); + return () => { cancelled = true; }; + }, [captionSearchMode, query, colorQuery.paletteOnly]); + + // CLIP text-encoder embedding for the visual side. Loaded independently + // so a slow CLIP download doesn't block text-side ranking — scenes can + // be shown via text-only cosine until the CLIP query vector lands. + useEffect(() => { + if ( + captionSearchMode !== 'semantic' + || query.trim().length === 0 + || colorQuery.paletteOnly + ) { + setQueryImageEmbedding(null); + setQueryImageState('idle'); + return; + } + let cancelled = false; + setQueryImageState('embedding'); + // Use the ensembled path so a one-word query ("fighting") gets + // wrapped in natural-sentence templates before embedding — CLIP is + // badly behaved on bare tokens and the averaged vector materially + // reduces false positives like "a tower matches fighting". 
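`embedQueryForImages` is opaque in this diff; the natural-sentence wrapping the comment above alludes to is the standard CLIP prompt-ensembling recipe, roughly as follows. The template list and the `embed` parameter are assumptions about the provider's internals:

```ts
// Illustrative prompt-ensembling for a CLIP text query: embed the query
// under several natural-sentence templates, average, and re-normalize.
const TEMPLATES = [
  (q: string) => `a photo of ${q}`,
  (q: string) => `a video frame showing ${q}`,
  (q: string) => `a scene with ${q}`,
];

async function embedEnsembled(
  embed: (text: string) => Promise<Float32Array>,
  query: string,
): Promise<Float32Array> {
  const vectors = await Promise.all(TEMPLATES.map((t) => embed(t(query))));
  const mean = new Float32Array(vectors[0].length);
  for (const v of vectors) {
    for (let i = 0; i < mean.length; i++) mean[i] += v[i] / vectors.length;
  }
  // Re-normalize so downstream cosine scores stay calibrated.
  let norm = 0;
  for (let i = 0; i < mean.length; i++) norm += mean[i] * mean[i];
  norm = Math.sqrt(norm) || 1;
  for (let i = 0; i < mean.length; i++) mean[i] /= norm;
  return mean;
}
```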
+ void clipProvider + .embedQueryForImages(query.trim()) + .then((vector) => { + if (cancelled) return; + if (vector) { + setQueryImageEmbedding(vector); + setQueryImageState('ready'); + } else { + setQueryImageState('idle'); + } + }) + .catch((error) => { + if (!cancelled) { + log.warn('CLIP query embedding failed — skipping visual ranking', { query, error }); + setQueryImageEmbedding(null); + setQueryImageState('idle'); + } + }); + return () => { cancelled = true; }; + }, [captionSearchMode, query, colorQuery.paletteOnly]); + + return useMemo(() => { + const allScenes: RankableScene[] = []; + const reanalyzingMedia: Array<{ id: string; fileName: string }> = []; + let clipsWithCaptions = 0; + + for (const media of mediaItems) { + if (scope && media.id !== scope) continue; + // Hide entries for media that's actively being Analyzed-with-AI — + // the old captions are about to be replaced, and surfacing them + // alongside "re-analyzing" state would be misleading. + if (taggingMediaIds.has(media.id)) { + if (media.aiCaptions && media.aiCaptions.length > 0) { + reanalyzingMedia.push({ id: media.id, fileName: media.fileName }); + } + continue; + } + const captions = media.aiCaptions; + if (!captions || captions.length === 0) continue; + clipsWithCaptions += 1; + captions.forEach((caption, captionIndex) => { + allScenes.push({ + id: `${media.id}:${captionIndex}`, + mediaId: media.id, + mediaFileName: media.fileName, + timeSec: caption.timeSec, + text: caption.text, + thumbRelPath: caption.thumbRelPath, + palette: caption.palette, + }); + }); + } + + const isSemanticActive = ( + captionSearchMode === 'semantic' + && query.trim().length > 0 + && queryEmbedding !== null + ); + + const textEmbeddings = getEmbeddingsSnapshot(); + const imageEmbeddings = getImageEmbeddingsSnapshot(); + const paletteSnapshot = getPalettesSnapshot(); + + // A reference palette forces semantic-lane ranking (palette-only + // scoring inside semanticRank). The query stays visible in the input + // but is ignored until the reference is cleared. + let ranked; + if (reference) { + ranked = semanticRank(new Float32Array(0), allScenes, textEmbeddings, { + palettes: paletteSnapshot, + referencePalette: reference.palette, + }); + } else if (isSemanticActive) { + ranked = semanticRank(queryEmbedding!, allScenes, textEmbeddings, { + queryImageEmbedding, + imageEmbeddings, + query, + palettes: paletteSnapshot, + }); + } else { + ranked = rankScenes(query, allScenes); + } + + // Coverage stats over the scenes the user can currently see, not + // over the whole library — keeps the "indexed" counter honest when + // scoped to a single media. + let sceneTextIndexed = 0; + let sceneImageIndexed = 0; + for (const scene of allScenes) { + if (textEmbeddings.has(scene.id)) sceneTextIndexed += 1; + if (imageEmbeddings.has(scene.id)) sceneImageIndexed += 1; + } + + const hasRankingSignal = query.trim().length > 0 || !!reference; + if (!hasRankingSignal || sortMode === 'time' || sortMode === 'name') { + ranked.sort((a, b) => { + if (a.mediaFileName !== b.mediaFileName) { + return a.mediaFileName.localeCompare(b.mediaFileName); + } + return a.timeSec - b.timeSec; + }); + } + // relevance sort is the default output of rankScenes / semanticRank. + + return { + scenes: ranked, + totalScenes: allScenes.length, + totalClips: mediaItems.length, + clipsWithCaptions, + reanalyzingMedia, + activeMode: isSemanticActive || reference ? 
'semantic' : 'keyword', + isQuerying: hasRankingSignal, + sceneTextIndexed, + sceneImageIndexed, + queryTextEmbedding: queryTextState, + queryImageEmbedding: queryImageState, + }; + }, [ + mediaItems, + taggingMediaIds, + query, + scope, + sortMode, + captionSearchMode, + queryEmbedding, + queryImageEmbedding, + queryTextState, + queryImageState, + colorQuery.paletteOnly, + reference, + ]); +} diff --git a/src/features/scene-browser/hooks/use-semantic-index.test.tsx b/src/features/scene-browser/hooks/use-semantic-index.test.tsx new file mode 100644 index 000000000..59bbd44af --- /dev/null +++ b/src/features/scene-browser/hooks/use-semantic-index.test.tsx @@ -0,0 +1,111 @@ +import { render, screen, waitFor } from '@testing-library/react'; +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import { create } from 'zustand'; + +const ensureEmbeddingsLoadedMock = vi.fn(); +const indexMediaCaptionsMock = vi.fn(); +const indexMediaImageCaptionsMock = vi.fn(); +const isMediaMissingEmbeddingsMock = vi.fn(); +const isMediaMissingImageEmbeddingsMock = vi.fn(); + +type MediaItem = { + id: string; + aiCaptions?: Array<{ timeSec: number; text: string }>; +}; + +const useMediaLibraryStore = create<{ + mediaItems: MediaItem[]; + taggingMediaIds: Set; +}>(() => ({ + mediaItems: [], + taggingMediaIds: new Set(), +})); + +const useSettingsStore = create<{ + captionSearchMode: 'keyword' | 'semantic'; +}>(() => ({ + captionSearchMode: 'keyword', +})); + +vi.mock('../deps/media-library', () => ({ + useMediaLibraryStore, +})); + +vi.mock('../deps/settings', () => ({ + useSettingsStore, +})); + +vi.mock('../utils/embeddings-cache', () => ({ + ensureEmbeddingsLoaded: ensureEmbeddingsLoadedMock, + indexMediaCaptions: indexMediaCaptionsMock, + indexMediaImageCaptions: indexMediaImageCaptionsMock, + isMediaMissingEmbeddings: isMediaMissingEmbeddingsMock, + isMediaMissingImageEmbeddings: isMediaMissingImageEmbeddingsMock, +})); + +const { useSemanticIndex } = await import('./use-semantic-index'); + +function SemanticIndexProbe() { + const progress = useSemanticIndex(); + return ( +
+ ); +} + +describe('useSemanticIndex', () => { + beforeEach(() => { + vi.clearAllMocks(); + useSettingsStore.setState({ captionSearchMode: 'semantic' }); + useMediaLibraryStore.setState({ + mediaItems: [ + { + id: 'media-1', + aiCaptions: [{ timeSec: 0, text: 'A scene' }], + }, + ], + taggingMediaIds: new Set(), + }); + }); + + it('clears stale progress when a rerun becomes a no-op after store updates', async () => { + let textIndexed = false; + + ensureEmbeddingsLoadedMock.mockResolvedValue(undefined); + isMediaMissingEmbeddingsMock.mockImplementation(() => !textIndexed); + isMediaMissingImageEmbeddingsMock.mockReturnValue(false); + indexMediaImageCaptionsMock.mockResolvedValue(undefined); + indexMediaCaptionsMock.mockImplementation(async (mediaId: string) => { + await Promise.resolve(); + textIndexed = true; + useMediaLibraryStore.setState((state) => ({ + mediaItems: state.mediaItems.map((item) => ( + item.id === mediaId + ? { ...item, aiCaptions: [...(item.aiCaptions ?? [])] } + : item + )), + })); + await Promise.resolve(); + }); + + render(); + + await waitFor(() => { + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-total', '1'); + }); + + await waitFor(() => { + expect(indexMediaCaptionsMock).toHaveBeenCalledTimes(1); + }); + + await waitFor(() => { + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-indexing', '0'); + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-total', '0'); + expect(screen.getByTestId('semantic-index-probe')).toHaveAttribute('data-loading', 'false'); + }); + }); +}); diff --git a/src/features/scene-browser/hooks/use-semantic-index.ts b/src/features/scene-browser/hooks/use-semantic-index.ts new file mode 100644 index 000000000..cc81ab288 --- /dev/null +++ b/src/features/scene-browser/hooks/use-semantic-index.ts @@ -0,0 +1,141 @@ +/** + * Orchestrates retroactive semantic indexing when the user switches into + * semantic mode. Hydrates embeddings for media that already have them on + * disk; runs the embedding model for media that don't. + * + * Exposes progress so the panel can surface a banner ("Indexing 3/12 + * clips…") while work is in flight. Designed to be safe to mount many + * times — the underlying cache + promise maps deduplicate real work. + */ + +import { useEffect, useRef, useState } from 'react'; +import { createLogger } from '@/shared/logging/logger'; +import { useMediaLibraryStore } from '../deps/media-library'; +import { useSettingsStore } from '../deps/settings'; +import { + ensureEmbeddingsLoaded, + indexMediaCaptions, + indexMediaImageCaptions, + isMediaMissingEmbeddings, + isMediaMissingImageEmbeddings, +} from '../utils/embeddings-cache'; + +const log = createLogger('SceneBrowser:SemanticIndex'); + +export interface SemanticIndexProgress { + /** Running indexer is generating fresh embeddings (slow path). */ + indexing: number; + /** Total clips that need indexing in the current pass. */ + indexTotal: number; + /** Model is downloading — blocks even the hydration path. */ + loadingModel: boolean; + /** A clip just finished indexing — used by the banner for a pulse. 
*/ + lastCompletedAt: number; +} + +const INITIAL_PROGRESS: SemanticIndexProgress = { + indexing: 0, + indexTotal: 0, + loadingModel: false, + lastCompletedAt: 0, +}; + +export function useSemanticIndex(): SemanticIndexProgress { + const mode = useSettingsStore((s) => s.captionSearchMode); + const mediaItems = useMediaLibraryStore((s) => s.mediaItems); + const taggingMediaIds = useMediaLibraryStore((s) => s.taggingMediaIds); + const [progress, setProgress] = useState(INITIAL_PROGRESS); + const runIdRef = useRef(0); + + useEffect(() => { + if (mode !== 'semantic') { + setProgress(INITIAL_PROGRESS); + return; + } + + const runId = ++runIdRef.current; + + const candidates = mediaItems.filter((media) => ( + (media.aiCaptions?.length ?? 0) > 0 && !taggingMediaIds.has(media.id) + )); + if (candidates.length === 0) { + setProgress(INITIAL_PROGRESS); + return; + } + + let cancelled = false; + + void (async () => { + // Phase 1: hydrate everything that already has on-disk embeddings. + // Parallel because the bulk of the work is just reading a small bin. + await Promise.all(candidates.map((media) => ensureEmbeddingsLoaded(media.id))); + if (cancelled || runId !== runIdRef.current) return; + + // Phase 2: fill in text embeddings that are missing (fast path on + // already-downloaded all-MiniLM model, ~20ms per caption). + const needsTextIndex = candidates.filter((media) => isMediaMissingEmbeddings(media.id)); + const needsImageIndex = candidates.filter((media) => isMediaMissingImageEmbeddings(media.id)); + const totalToIndex = needsTextIndex.length + needsImageIndex.length; + if (totalToIndex === 0) { + setProgress(INITIAL_PROGRESS); + return; + } + + setProgress({ + indexing: 0, + indexTotal: totalToIndex, + loadingModel: true, + lastCompletedAt: 0, + }); + + let done = 0; + const advance = () => { + done += 1; + setProgress({ + indexing: done, + indexTotal: totalToIndex, + loadingModel: false, + lastCompletedAt: Date.now(), + }); + }; + + for (const media of needsTextIndex) { + if (cancelled || runId !== runIdRef.current) return; + try { + await indexMediaCaptions(media.id); + } catch (error) { + log.warn('Retroactive text embedding failed', { + mediaId: media.id, fileName: media.fileName, error, + }); + } + advance(); + } + + // Phase 3: image indexing. This is the expensive side — CLIP is + // ~90 MB to download and ~50 ms per image, so do it strictly after + // text indexing so at least keyword → text semantic is immediately + // usable while visual search warms up. + for (const media of needsImageIndex) { + if (cancelled || runId !== runIdRef.current) return; + try { + await indexMediaImageCaptions(media.id); + } catch (error) { + log.warn('Retroactive image embedding failed', { + mediaId: media.id, fileName: media.fileName, error, + }); + } + advance(); + } + + if (!cancelled && runId === runIdRef.current) { + setProgress(INITIAL_PROGRESS); + } + })(); + + return () => { + cancelled = true; + }; + }, [mode, mediaItems, taggingMediaIds]); + + return progress; +} diff --git a/src/features/scene-browser/index.ts b/src/features/scene-browser/index.ts new file mode 100644 index 000000000..274eb2c07 --- /dev/null +++ b/src/features/scene-browser/index.ts @@ -0,0 +1,13 @@ +/** + * Scene Browser — cross-library visual search for AI-generated captions. + * + * Public API: + * - `` — the full panel; mount inside the media-library + * body when `useSceneBrowserStore.open === true`. + * - `useSceneBrowserStore` — control open/close, query, scope, sort. 
+ */ + +export { SceneBrowserPanel } from './components/scene-browser-panel'; +export { useSceneBrowserStore } from './stores/scene-browser-store'; +export type { SceneBrowserSortMode } from './stores/scene-browser-store'; +export { invalidateMediaCaptionThumbnails } from './utils/invalidate'; diff --git a/src/features/scene-browser/stores/scene-browser-store.ts b/src/features/scene-browser/stores/scene-browser-store.ts new file mode 100644 index 000000000..193df7920 --- /dev/null +++ b/src/features/scene-browser/stores/scene-browser-store.ts @@ -0,0 +1,128 @@ +import { create, type StoreApi, type UseBoundStore } from 'zustand'; +import type { PaletteEntry } from '../deps/analysis'; + +export type SceneBrowserSortMode = 'relevance' | 'time' | 'name'; +export type SceneBrowserViewMode = 'list' | 'grid'; + +export interface SceneBrowserReference { + /** Scene id whose palette is the reference — for dedupe and the clear chip. */ + sceneId: string; + /** Short human label (e.g. `"foo.mp4 · 0:12"`) shown in the chip. */ + label: string; + /** The reference palette (CIELAB + weight). */ + palette: PaletteEntry[]; +} + +/** + * `scope === null` is the default cross-library view. A non-null scope is + * the mediaId the Scene Browser was opened from — set when the user clicks + * "Open in Scene Browser" from a media card's info popover. + */ +interface SceneBrowserState { + open: boolean; + query: string; + scope: string | null; + sortMode: SceneBrowserSortMode; + /** Incrementing token the search input watches to force a focus. */ + focusNonce: number; + /** + * Active "find similar palette" reference. When set, the ranker scores + * scenes by palette distance against this reference instead of by + * query semantics. Cleared explicitly (chip × or escape). + */ + reference: SceneBrowserReference | null; + /** + * Panel-local Color Mode — swaps the search input for a grid of the + * library's dominant colors. Orthogonal to captionSearchMode; a user + * can come back to their preferred keyword/semantic lane by toggling + * it off. Not persisted so the default is always "text search". + */ + colorMode: boolean; + /** + * List vs grid layout for the results area. Grid is a responsive + * thumbnail-first layout (good for color/visual scanning); list is + * thumbnail + caption text (good for reading matches). + */ + viewMode: SceneBrowserViewMode; +} + +interface SceneBrowserActions { + openBrowser: (options?: { mediaId?: string | null; focus?: boolean }) => void; + closeBrowser: () => void; + toggleBrowser: () => void; + setQuery: (query: string) => void; + setScope: (scope: string | null) => void; + setSortMode: (mode: SceneBrowserSortMode) => void; + requestFocus: () => void; + setReference: (reference: SceneBrowserReference | null) => void; + setColorMode: (colorMode: boolean) => void; + setViewMode: (viewMode: SceneBrowserViewMode) => void; + reset: () => void; +} + +const INITIAL_STATE: SceneBrowserState = { + open: false, + query: '', + scope: null, + sortMode: 'relevance', + focusNonce: 0, + reference: null, + colorMode: false, + viewMode: 'list', +}; + +type SceneBrowserStoreApi = UseBoundStore>; + +declare global { + // eslint-disable-next-line no-var + var __FREECUT_SCENE_BROWSER_STORE__: SceneBrowserStoreApi | undefined; +} + +const hotStore = import.meta.env.DEV ? globalThis.__FREECUT_SCENE_BROWSER_STORE__ : undefined; + +// Preserve query/scope/color-mode/reference across Vite HMR in dev so a +// file save doesn't wipe the panel's current search context. 
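The `hotStore ?? create(...)` expression just below is the standard dev-only trick for keeping a Zustand store alive across Vite HMR; distilled to its essentials on a toy counter store (assumes Vite's `import.meta.env.DEV` flag, as in the real file):

```ts
import { create } from 'zustand';

interface CounterState { count: number; inc: () => void }

declare global {
  // eslint-disable-next-line no-var
  var __HOT_COUNTER_STORE__: ReturnType<typeof makeStore> | undefined;
}

function makeStore() {
  return create<CounterState>((set) => ({
    count: 0,
    inc: () => set((s) => ({ count: s.count + 1 })),
  }));
}

// On hot reload the module re-executes; reusing the instance stashed on
// globalThis keeps existing subscribers and state alive in dev, while
// production always gets a fresh store.
export const useCounterStore =
  (import.meta.env.DEV && globalThis.__HOT_COUNTER_STORE__) || makeStore();

if (import.meta.env.DEV) {
  globalThis.__HOT_COUNTER_STORE__ = useCounterStore;
}
```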
+const sceneBrowserStore: SceneBrowserStoreApi = hotStore ?? create((set) => ({ + ...INITIAL_STATE, + + openBrowser: (options) => set((state) => ({ + open: true, + scope: options?.mediaId !== undefined ? options.mediaId : state.scope, + focusNonce: options?.focus === false ? state.focusNonce : state.focusNonce + 1, + })), + + closeBrowser: () => set({ open: false }), + + toggleBrowser: () => set((state) => ({ + open: !state.open, + focusNonce: !state.open ? state.focusNonce + 1 : state.focusNonce, + })), + + setQuery: (query) => set({ query }), + + setScope: (scope) => set({ scope }), + + setSortMode: (sortMode) => set({ sortMode }), + + requestFocus: () => set((state) => ({ focusNonce: state.focusNonce + 1 })), + + setReference: (reference) => set({ reference }), + + setViewMode: (viewMode) => set({ viewMode }), + + setColorMode: (colorMode) => set((state) => ({ + colorMode, + // Leaving color mode clears any active reference — the mode is the + // only way to land on one, so the chip shouldn't outlive the mode. + reference: colorMode ? state.reference : null, + query: colorMode ? '' : state.query, + })), + + reset: () => set(INITIAL_STATE), +})); + +if (import.meta.env.DEV) { + globalThis.__FREECUT_SCENE_BROWSER_STORE__ = sceneBrowserStore; +} + +export const useSceneBrowserStore = sceneBrowserStore; diff --git a/src/features/scene-browser/utils/color-boost.test.ts b/src/features/scene-browser/utils/color-boost.test.ts new file mode 100644 index 000000000..fbdc60fcc --- /dev/null +++ b/src/features/scene-browser/utils/color-boost.test.ts @@ -0,0 +1,239 @@ +import { describe, expect, it } from 'vitest'; +import { + colorBoostFor, + extractQueryColors, + nearestColorFamily, + palettePairDistance, + paletteSimilarityBoost, + parseColorQuery, +} from './color-boost'; +import type { PaletteEntry } from '../deps/analysis'; + +describe('extractQueryColors', () => { + it('finds a single family for an explicit color-intent query', () => { + const result = extractQueryColors('red color'); + expect(result.map((r) => r.family)).toEqual(['red']); + }); + + it('maps synonyms to their canonical family when the query is palette-oriented', () => { + const result = extractQueryColors('crimson tones'); + expect(result.map((r) => r.family)).toEqual(['red']); + }); + + it('returns empty for queries with no color terms', () => { + expect(extractQueryColors('a man fighting')).toEqual([]); + }); + + it('extracts color families for bare color-only queries', () => { + // A query that is *only* color words has no object semantics — the + // ranker should match against the palette instead of sending CLIP + // chasing unrelated captions that happen to cluster near the token. 
+ expect(extractQueryColors('ruby scarlet red').map((c) => c.family)).toEqual(['red']); + expect(extractQueryColors('pink').map((c) => c.family)).toEqual(['pink']); + }); + + it('returns empty when non-color content words are present without explicit palette intent', () => { + expect(extractQueryColors('orange sunset navy water')).toEqual([]); + }); + + it('supports color prefix syntax and multiple distinct families', () => { + const result = extractQueryColors('color orange palette navy'); + expect(result.map((r) => r.family).sort()).toEqual(['blue', 'orange']); + }); +}); + +describe('parseColorQuery', () => { + it('marks pure color-intent queries as palette-only', () => { + expect(parseColorQuery('yellow color')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: true, + }); + expect(parseColorQuery('color:yellow')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: true, + }); + }); + + it('keeps mixed content queries out of palette-only mode', () => { + expect(parseColorQuery('yellow color jacket')).toMatchObject({ + colors: [{ family: 'yellow' }], + paletteOnly: false, + }); + }); + + it('treats bare single-color queries as palette intent', () => { + expect(parseColorQuery('pink')).toMatchObject({ + colors: [{ family: 'pink' }], + paletteOnly: true, + }); + }); + + it('treats multi-color-only queries as palette intent', () => { + expect(parseColorQuery('pink purple')).toMatchObject({ + colors: [{ family: 'pink' }, { family: 'purple' }], + paletteOnly: true, + }); + }); +}); + +const REDS: PaletteEntry = { l: 53, a: 70, b: 50, weight: 0.5 }; +const GREEN: PaletteEntry = { l: 60, a: -55, b: 50, weight: 0.3 }; +const BLUES: PaletteEntry = { l: 40, a: 15, b: -60, weight: 0.2 }; + +describe('colorBoostFor', () => { + it('returns a non-zero boost when palette contains the query color', () => { + const queries = extractQueryColors('red color'); + const result = colorBoostFor(queries, [REDS, GREEN, BLUES]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('red'); + expect(result?.boost).toBeGreaterThan(0.1); + }); + + it('returns null when palette has no close match', () => { + const queries = extractQueryColors('red color'); + const result = colorBoostFor(queries, [ + { l: 60, a: -55, b: 50, weight: 1.0 }, + ]); + expect(result).toBeNull(); + }); + + it('returns null for empty palette', () => { + const queries = extractQueryColors('red color'); + expect(colorBoostFor(queries, [])).toBeNull(); + expect(colorBoostFor(queries, undefined)).toBeNull(); + }); + + it('returns null for query without color words', () => { + const queries = extractQueryColors('a scene with people'); + expect(colorBoostFor(queries, [REDS, GREEN, BLUES])).toBeNull(); + }); + + it('weighs larger palette entries higher', () => { + const queries = extractQueryColors('red color'); + const majorRed = colorBoostFor(queries, [{ l: 53, a: 70, b: 50, weight: 0.8 }]); + const minorRed = colorBoostFor(queries, [{ l: 53, a: 70, b: 50, weight: 0.05 }]); + expect(majorRed?.boost).toBeGreaterThan(minorRed?.boost ?? 0); + }); + + it('picks the best match across multiple query colors', () => { + const queries = extractQueryColors('red and blue palette'); + const result = colorBoostFor(queries, [ + { l: 50, a: 50, b: 40, weight: 0.2 }, + { l: 42, a: 18, b: -58, weight: 0.7 }, + ]); + expect(result?.family).toBe('blue'); + }); + + it('does not match pink against warm skin-tone palette entries', () => { + // Lab ~(65, 20, 20) is a common medium skin tone — warm, moderate + // chroma. 
It sat within the old pink boost range and polluted "pink" + // results with face-dominated dim scenes. + const queries = extractQueryColors('pink'); + const result = colorBoostFor(queries, [ + { l: 65, a: 20, b: 20, weight: 0.5 }, + { l: 40, a: 10, b: 15, weight: 0.3 }, + ]); + expect(result).toBeNull(); + }); + + it('matches pink against genuinely pink palette entries', () => { + const queries = extractQueryColors('pink'); + const result = colorBoostFor(queries, [ + { l: 65, a: 55, b: -5, weight: 0.4 }, + { l: 20, a: 5, b: 5, weight: 0.4 }, + ]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('pink'); + expect(result?.boost).toBeGreaterThan(0.1); + }); + + it('does not match chromatic families against low-chroma gray palette entries', () => { + const queries = extractQueryColors('red'); + const result = colorBoostFor(queries, [ + { l: 55, a: 2, b: 1, weight: 0.8 }, // near-gray + ]); + expect(result).toBeNull(); + }); + + it('still matches neutral families against low-chroma entries', () => { + // The chroma/hue gate applies only to chromatic families — gray, + // black, white should still match near-neutral palette entries. + const queries = extractQueryColors('gray tones'); + const result = colorBoostFor(queries, [ + { l: 55, a: 2, b: 1, weight: 0.6 }, + ]); + expect(result).not.toBeNull(); + expect(result?.family).toBe('gray'); + }); +}); + +describe('nearestColorFamily', () => { + it('maps a clearly chromatic swatch to the obvious family', () => { + expect(nearestColorFamily({ l: 53, a: 70, b: 50 })).toBe('red'); + expect(nearestColorFamily({ l: 40, a: 15, b: -60 })).toBe('blue'); + expect(nearestColorFamily({ l: 90, a: -5, b: 80 })).toBe('yellow'); + }); + + it('maps near-neutral swatches to gray/black/white', () => { + expect(nearestColorFamily({ l: 55, a: 0, b: 0 })).toBe('gray'); + expect(nearestColorFamily({ l: 95, a: 0, b: 0 })).toBe('white'); + expect(nearestColorFamily({ l: 10, a: 0, b: 0 })).toBe('black'); + }); +}); + +describe('palettePairDistance', () => { + it('returns 0 for identical palettes', () => { + const a: PaletteEntry[] = [ + { l: 50, a: 60, b: 40, weight: 0.6 }, + { l: 40, a: 20, b: -50, weight: 0.4 }, + ]; + expect(palettePairDistance(a, a)).toBeCloseTo(0, 5); + }); + + it('is symmetric', () => { + const a: PaletteEntry[] = [{ l: 60, a: 40, b: 30, weight: 0.8 }]; + const b: PaletteEntry[] = [{ l: 65, a: 45, b: 20, weight: 1.0 }]; + expect(palettePairDistance(a, b)).toBeCloseTo(palettePairDistance(b, a), 5); + }); + + it('returns a larger distance for perceptually different palettes', () => { + const warmReds: PaletteEntry[] = [{ l: 53, a: 70, b: 50, weight: 1 }]; + const coolBlues: PaletteEntry[] = [{ l: 40, a: 15, b: -60, weight: 1 }]; + expect(palettePairDistance(warmReds, coolBlues)).toBeGreaterThan(40); + }); + + it('returns infinity for empty palettes', () => { + const a: PaletteEntry[] = [{ l: 50, a: 0, b: 0, weight: 1 }]; + expect(palettePairDistance(a, [])).toBe(Number.POSITIVE_INFINITY); + expect(palettePairDistance([], a)).toBe(Number.POSITIVE_INFINITY); + }); +}); + +describe('paletteSimilarityBoost', () => { + it('produces a non-zero boost for similar palettes', () => { + const ref: PaletteEntry[] = [ + { l: 50, a: 60, b: 40, weight: 0.7 }, + { l: 40, a: 20, b: -50, weight: 0.3 }, + ]; + const candidate: PaletteEntry[] = [ + { l: 52, a: 62, b: 38, weight: 0.6 }, + { l: 42, a: 18, b: -52, weight: 0.4 }, + ]; + const result = paletteSimilarityBoost(ref, candidate); + expect(result).not.toBeNull(); + 
expect(result?.boost).toBeGreaterThan(0.1);
+    expect(result?.distance).toBeLessThan(10);
+  });
+
+  it('returns null for clearly dissimilar palettes', () => {
+    const warmReds: PaletteEntry[] = [{ l: 53, a: 70, b: 50, weight: 1 }];
+    const coolGreens: PaletteEntry[] = [{ l: 60, a: -55, b: 50, weight: 1 }];
+    expect(paletteSimilarityBoost(warmReds, coolGreens)).toBeNull();
+  });
+
+  it('returns null for missing inputs', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 0, b: 0, weight: 1 }];
+    expect(paletteSimilarityBoost(undefined, a)).toBeNull();
+    expect(paletteSimilarityBoost(a, undefined)).toBeNull();
+  });
+});
diff --git a/src/features/scene-browser/utils/color-boost.ts b/src/features/scene-browser/utils/color-boost.ts
new file mode 100644
index 000000000..fa00fe2bf
--- /dev/null
+++ b/src/features/scene-browser/utils/color-boost.ts
@@ -0,0 +1,365 @@
+/**
+ * Color-query boost for semantic search.
+ *
+ * CLIP is weak on pure color queries — it was trained on object-centric
+ * captions, so "red color" drifts to whatever CLIP happens to associate
+ * with the token. Industry CBIR systems (Imgix, TinEye) sidestep this
+ * entirely by pre-extracting dominant colors per image and matching
+ * query colors via ∆E in CIELAB, the approximately-perceptually-uniform
+ * color space. We do the same here, using the pre-computed palette on
+ * each `MediaCaption.palette`.
+ *
+ * Output: a ColorBoost per scene with the closest palette match, its
+ * perceptual distance, and a score contribution calibrated to cosine
+ * magnitudes so it composes cleanly with the text/image scores.
+ */
+
+import { deltaE2000, type LabColor, type PaletteEntry } from '../deps/analysis';
+
+export interface ColorBoostResult {
+  /** Additive score contribution, in cosine-compatible units. */
+  boost: number;
+  /** Query color family that matched (e.g. "red"). */
+  family: string;
+  /** Minimum ∆E across the scene's palette. */
+  deltaE: number;
+  /** The palette entry that produced the minimum distance. */
+  matched: PaletteEntry;
+}
+
+/**
+ * Tuned so that a visually-identical match (∆E ~0) on a dominant color
+ * contributes the full 0.18 — roughly one confidence tier. ∆E ≥ 30
+ * ("obviously different") gives 0. Linear falloff in between keeps the
+ * math simple and explains itself in chip tooltips.
+ */
+const MAX_BOOST = 0.18;
+const ZERO_BOOST_DELTA_E = 30;
+
+function boostFromDeltaE(deltaE: number, weight: number): number {
+  if (deltaE >= ZERO_BOOST_DELTA_E) return 0;
+  const linear = (ZERO_BOOST_DELTA_E - deltaE) / ZERO_BOOST_DELTA_E;
+  // Weight shrinks the contribution when the matched color is a tiny
+  // fraction of the thumbnail (a 3% pixel slice of red doesn't really
+  // make the scene "red").
+  const weightFactor = Math.min(1, weight / 0.2);
+  return MAX_BOOST * linear * weightFactor;
+}
+
+/** Families that demand a visibly chromatic palette entry to match. */
+const CHROMATIC_FAMILIES = new Set([
+  'red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'coral',
+]);
+
+/** Minimum chroma (sqrt(a²+b²)) for a palette entry to match a chromatic family. */
+const MIN_ENTRY_CHROMA = 15;
+
+/** Max hue-angle difference (degrees) between palette entry and chromatic family. */
+const MAX_HUE_DELTA_DEG = 45;
+
+function labHueDeg(a: number, b: number): number {
+  const deg = (Math.atan2(b, a) * 180) / Math.PI;
+  return deg < 0 ? deg + 360 : deg;
+}
+
+function labChroma(a: number, b: number): number {
+  return Math.sqrt(a * a + b * b);
+}
+
+function hueDelta(h1: number, h2: number): number {
+  const diff = Math.abs(h1 - h2) % 360;
+  return diff > 180 ? 360 - diff : diff;
+}
+
+/**
+ * Gate low-chroma or off-hue palette entries out of chromatic family
+ * matches. ∆E 2000 gracefully collapses hue weight for gray-ish colors,
+ * which is correct for color science but wrong for user intent: a user
+ * asking for "pink" doesn't want a beige scene that happens to sit
+ * near-ish to the pink Lab reference. Neutral families (white/black/
+ * gray/brown) bypass the gate — their whole point is low-chroma matching.
+ */
+function paletteEntryCompatibleWithFamily(
+  family: ColorFamilyDefinition,
+  entry: PaletteEntry,
+): boolean {
+  if (!CHROMATIC_FAMILIES.has(family.family)) return true;
+  if (labChroma(entry.a, entry.b) < MIN_ENTRY_CHROMA) return false;
+  const familyHue = labHueDeg(family.lab.a, family.lab.b);
+  const entryHue = labHueDeg(entry.a, entry.b);
+  return hueDelta(familyHue, entryHue) <= MAX_HUE_DELTA_DEG;
+}
+
+/**
+ * Canonical Lab coordinates for each color family, plus the synonyms
+ * that map into it. Values are mid-saturation reference points — for
+ * `red` we pick a slightly-desaturated Lab(53, 70, 50) rather than
+ * pure-sRGB red (Lab 53, 80, 67) because VLM-described "reds" in
+ * natural footage tend to sit a bit off the primary.
+ *
+ * The list stays conservative to avoid false-positive query parses
+ * ("rose" as a flower vs. "rose" as a color — we accept the color
+ * reading; users can always add descriptive words to disambiguate).
+ */
+export interface ColorFamilyDefinition {
+  family: string;
+  lab: LabColor;
+  synonyms: string[];
+}
+
+export interface ParsedColorQuery {
+  colors: ColorFamilyDefinition[];
+  /**
+   * True when the query is asking for palette alone (e.g. `color:yellow`,
+   * `yellow color`, `crimson tones`) rather than "yellow jacket" /
+   * "blue car" object semantics.
+   */
+  paletteOnly: boolean;
+}
+
+const COLOR_FAMILIES: ColorFamilyDefinition[] = [
+  { family: 'red', lab: { l: 53, a: 70, b: 50 }, synonyms: ['red', 'crimson', 'scarlet', 'maroon', 'ruby', 'burgundy'] },
+  { family: 'orange', lab: { l: 65, a: 40, b: 65 }, synonyms: ['orange', 'amber', 'tangerine', 'peach', 'apricot'] },
+  { family: 'yellow', lab: { l: 90, a: -5, b: 80 }, synonyms: ['yellow', 'golden', 'gold', 'mustard', 'lemon'] },
+  { family: 'green', lab: { l: 60, a: -55, b: 50 }, synonyms: ['green', 'emerald', 'lime', 'olive', 'forest', 'mint', 'sage'] },
+  { family: 'teal', lab: { l: 60, a: -40, b: -15 }, synonyms: ['teal', 'turquoise', 'cyan', 'aqua'] },
+  { family: 'blue', lab: { l: 40, a: 15, b: -60 }, synonyms: ['blue', 'navy', 'azure', 'cobalt', 'indigo', 'sapphire'] },
+  { family: 'purple', lab: { l: 40, a: 50, b: -45 }, synonyms: ['purple', 'violet', 'magenta', 'lavender', 'plum', 'lilac'] },
+  // Pink hue sits around 340-355° (negative b*), not the 5-10° range —
+  // at b=+5 we drift into salmon/coral and start matching warm skin
+  // tones in dimly lit scenes. Classic "pink" needs a cool shift.
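+  // For intuition (approximate values, using labHueDeg above):
+  // labHueDeg(50, -8) ≈ 351° sits inside that window, while
+  // labHueDeg(50, 5) ≈ 6° already drifts toward coral/salmon.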
+  { family: 'pink', lab: { l: 70, a: 50, b: -8 }, synonyms: ['pink', 'rose', 'fuchsia'] },
+  { family: 'coral', lab: { l: 68, a: 45, b: 25 }, synonyms: ['coral', 'salmon'] },
+  { family: 'brown', lab: { l: 40, a: 15, b: 35 }, synonyms: ['brown', 'tan', 'beige', 'chocolate', 'khaki', 'sepia'] },
+  { family: 'white', lab: { l: 95, a: 0, b: 0 }, synonyms: ['white', 'ivory', 'cream', 'snow', 'pearl'] },
+  { family: 'black', lab: { l: 10, a: 0, b: 0 }, synonyms: ['black', 'ebony', 'charcoal', 'midnight', 'onyx'] },
+  { family: 'gray', lab: { l: 55, a: 0, b: 0 }, synonyms: ['gray', 'grey', 'silver', 'slate', 'ash'] },
+];
+
+const SYNONYM_TO_FAMILY = new Map<string, ColorFamilyDefinition>();
+for (const def of COLOR_FAMILIES) {
+  for (const synonym of def.synonyms) SYNONYM_TO_FAMILY.set(synonym, def);
+}
+
+const COLOR_INTENT_TOKENS = new Set([
+  'color',
+  'colors',
+  'palette',
+  'palettes',
+  'tint',
+  'tints',
+  'tone',
+  'tones',
+  'hue',
+  'hues',
+  'grade',
+  'graded',
+  'grading',
+  'dominant',
+  'swatch',
+  'swatches',
+]);
+
+const COLOR_QUERY_FILLER_TOKENS = new Set([
+  'a',
+  'an',
+  'the',
+  'and',
+  'or',
+  'of',
+  'with',
+  'in',
+  'on',
+  'at',
+  'to',
+  'for',
+  'from',
+  'by',
+  'show',
+  'find',
+  'me',
+  'shot',
+  'shots',
+  'scene',
+  'scenes',
+  'clip',
+  'clips',
+  'frame',
+  'frames',
+  'image',
+  'images',
+  'video',
+  'videos',
+  'please',
+]);
+
+function tokenize(text: string): string[] {
+  return text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ').split(/\s+/).filter(Boolean);
+}
+
+/**
+ * Parse whether the query is explicitly asking for palette matching.
+ * Bare color words stay in the normal semantic lane so queries like
+ * "yellow jacket" or "orange sunset" don't get treated as palette-only.
+ */
+export function parseColorQuery(query: string): ParsedColorQuery {
+  const tokens = tokenize(query);
+  const explicitIntent = tokens.some((token) => COLOR_INTENT_TOKENS.has(token));
+  const seen = new Set<string>();
+  const colors: ColorFamilyDefinition[] = [];
+  for (const token of tokens) {
+    const def = SYNONYM_TO_FAMILY.get(token);
+    if (def && !seen.has(def.family)) {
+      seen.add(def.family);
+      colors.push(def);
+    }
+  }
+
+  // A query composed only of color words (e.g. "pink", "red blue") has no
+  // object semantics to chase — treat it as palette intent so CLIP's
+  // weakness on bare color tokens doesn't surface unrelated scenes above
+  // palette-matching ones. Multi-word queries like "pink jacket" still
+  // flow through the normal semantic path with an additive color boost.
+  const allTokensAreColors = tokens.length > 0
+    && tokens.every((token) => SYNONYM_TO_FAMILY.has(token));
+  if (allTokensAreColors) {
+    return { colors, paletteOnly: true };
+  }
+
+  if (!explicitIntent || colors.length === 0) {
+    return { colors: [], paletteOnly: false };
+  }
+
+  const paletteOnly = !tokens.some((token) => (
+    !COLOR_INTENT_TOKENS.has(token)
+    && !COLOR_QUERY_FILLER_TOKENS.has(token)
+    && !SYNONYM_TO_FAMILY.has(token)
+  ));
+
+  return { colors, paletteOnly };
+}
+
+/**
+ * Return the color families (with Lab coordinates) that the query
+ * explicitly asks to match by palette. Empty array means no color-aware
+ * ranking for this query.
+ */
+export function extractQueryColors(query: string): ColorFamilyDefinition[] {
+  return parseColorQuery(query).colors;
+}
+
+/**
+ * Find the best palette match for each query color, pick the overall
+ * closest one, and return the boost + metadata. `null` means no
+ * meaningful match (palette empty, or all ∆E ≥ 30).
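+ *
+ * A minimal usage sketch (palette values are illustrative):
+ *
+ * ```ts
+ * const colors = extractQueryColors('red color');
+ * const hit = colorBoostFor(colors, [{ l: 53, a: 70, b: 50, weight: 0.5 }]);
+ * // hit?.family === 'red'; ∆E ~0 and weight ≥ 0.2 → boost ~MAX_BOOST (0.18).
+ * // At ∆E 10 the boost falls linearly to 0.18 × (30 − 10) / 30 = 0.12.
+ * ```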
+ */ +export function colorBoostFor( + queryColors: ColorFamilyDefinition[], + palette: PaletteEntry[] | undefined, +): ColorBoostResult | null { + if (queryColors.length === 0 || !palette || palette.length === 0) return null; + + let best: ColorBoostResult | null = null; + for (const query of queryColors) { + for (const entry of palette) { + if (!paletteEntryCompatibleWithFamily(query, entry)) continue; + const distance = deltaE2000(query.lab, { l: entry.l, a: entry.a, b: entry.b }); + const boost = boostFromDeltaE(distance, entry.weight); + if (boost <= 0) continue; + if (!best || boost > best.boost) { + best = { boost, family: query.family, deltaE: distance, matched: entry }; + } + } + } + return best; +} + +/** + * Map a single Lab swatch to its closest color-family name. Used by the + * "click a palette swatch to search its color" interaction — we want the + * family word (e.g. `"red"`) that parseColorQuery will canonicalize back + * to the same palette-only search. + * + * Returns null for swatches too far from every family reference (rare — + * the families span the Lab gamut densely enough that the nearest is + * usually within ∆E 30). + */ +export function nearestColorFamily(entry: LabColor): string | null { + let bestFamily: string | null = null; + let bestDistance = Number.POSITIVE_INFINITY; + for (const def of COLOR_FAMILIES) { + const distance = deltaE2000(def.lab, entry); + if (distance < bestDistance) { + bestDistance = distance; + bestFamily = def.family; + } + } + return bestDistance < ZERO_BOOST_DELTA_E ? bestFamily : null; +} + +/** + * Symmetric weighted distance between two palettes via greedy nearest + * matching in both directions. Full Hungarian assignment would be cleaner + * but palettes are ≤6 entries in practice, and greedy matches stay within + * a few percent of optimal for that size. Averaging both directions keeps + * the metric symmetric — a tiny palette shouldn't "win" just because one + * of its entries happens to be close to a big entry in the reference. + * + * Output is a weighted-mean ∆E 2000 across matched entries. Palettes with + * no overlap return POSITIVE_INFINITY so the ranker can drop them. + */ +export function palettePairDistance( + a: PaletteEntry[], + b: PaletteEntry[], +): number { + if (a.length === 0 || b.length === 0) return Number.POSITIVE_INFINITY; + const forward = greedyDirectionalDistance(a, b); + const reverse = greedyDirectionalDistance(b, a); + if (!Number.isFinite(forward) || !Number.isFinite(reverse)) { + return Number.POSITIVE_INFINITY; + } + return (forward + reverse) / 2; +} + +function greedyDirectionalDistance( + source: PaletteEntry[], + target: PaletteEntry[], +): number { + let totalWeight = 0; + let totalWeighted = 0; + for (const s of source) { + let best = Number.POSITIVE_INFINITY; + for (const t of target) { + const d = deltaE2000({ l: s.l, a: s.a, b: s.b }, { l: t.l, a: t.a, b: t.b }); + if (d < best) best = d; + } + if (!Number.isFinite(best)) continue; + totalWeighted += best * s.weight; + totalWeight += s.weight; + } + if (totalWeight <= 0) return Number.POSITIVE_INFINITY; + return totalWeighted / totalWeight; +} + +export interface PaletteSimilarityResult { + /** Cosine-compatible boost (higher = more similar). */ + boost: number; + /** Weighted-mean ∆E 2000 between the two palettes. */ + distance: number; +} + +/** + * Turn a palette-pair distance into a score in cosine-compatible units, + * reusing the same linear falloff as the single-color boost so both + * signals compose cleanly when mixed. 
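+ *
+ * E.g. (worked numbers): a weighted-mean ∆E of 7.5 yields a boost of
+ * 0.18 × (30 − 7.5) / 30 = 0.135; any distance ≥ 30 returns null.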
+ */
+export function paletteSimilarityBoost(
+  reference: PaletteEntry[] | undefined,
+  candidate: PaletteEntry[] | undefined,
+): PaletteSimilarityResult | null {
+  if (!reference || !candidate) return null;
+  const distance = palettePairDistance(reference, candidate);
+  if (!Number.isFinite(distance) || distance >= ZERO_BOOST_DELTA_E) return null;
+  const linear = (ZERO_BOOST_DELTA_E - distance) / ZERO_BOOST_DELTA_E;
+  return { boost: MAX_BOOST * linear, distance };
+}
diff --git a/src/features/scene-browser/utils/embeddings-cache.ts b/src/features/scene-browser/utils/embeddings-cache.ts
new file mode 100644
index 000000000..0eac9b47d
--- /dev/null
+++ b/src/features/scene-browser/utils/embeddings-cache.ts
@@ -0,0 +1,344 @@
+/**
+ * In-memory embeddings cache for the Scene Browser.
+ *
+ * Caption embeddings live on disk as a packed `Float32Array` bin plus
+ * metadata in `captions.json`. The hook layer wants fast synchronous
+ * access during ranking, so this module hydrates per-media vectors on
+ * first request and keeps them in memory for the session.
+ *
+ * Cache keys are "scene ids" (`${mediaId}:${captionIndex}`) so the ranker
+ * doesn't need to know about media boundaries.
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import {
+  EMBEDDING_MODEL_DIM,
+  EMBEDDING_MODEL_ID,
+  CLIP_EMBEDDING_DIM,
+  CLIP_MODEL_ID,
+  buildEmbeddingText,
+  clipProvider,
+  embeddingsProvider,
+  extractDominantColors,
+  type PaletteEntry,
+} from '../deps/analysis';
+import {
+  mediaLibraryService,
+  useMediaLibraryStore,
+  type MediaMetadata,
+} from '../deps/media-library';
+import {
+  getCaptionEmbeddings,
+  getCaptionImageEmbeddings,
+  getCaptionThumbnailBlob,
+  getCaptionsEmbeddingsMeta,
+  getTranscript,
+  saveCaptionEmbeddings,
+  saveCaptionImageEmbeddings,
+} from '../deps/storage';
+
+const log = createLogger('SceneBrowser:EmbeddingsCache');
+
+/** sceneId → normalized text embedding vector. */
+const embeddings = new Map<string, Float32Array>();
+/** sceneId → normalized CLIP image embedding vector. */
+const imageEmbeddings = new Map<string, Float32Array>();
+/** sceneId → dominant-color palette entries (Lab + weight). */
+const palettes = new Map<string, PaletteEntry[]>();
+/** mediaId → outstanding hydration promise so concurrent callers share work. */
+const pendingHydrates = new Map<string, Promise<void>>();
+/** mediaId → outstanding text indexing (retroactive generate) promise. */
+const pendingIndexes = new Map<string, Promise<void>>();
+/** mediaId → outstanding image indexing (retroactive generate) promise. */
+const pendingImageIndexes = new Map<string, Promise<void>>();
+/** mediaIds we've already concluded have no usable text embeddings. */
+const missingEmbeddings = new Set<string>();
+/** mediaIds we've already concluded have no usable image embeddings. */
+const missingImageEmbeddings = new Set<string>();
+
+function sceneId(mediaId: string, captionIndex: number): string {
+  return `${mediaId}:${captionIndex}`;
+}
+
+function populateFromInMemory(media: MediaMetadata): boolean {
+  const captions = media.aiCaptions;
+  if (!captions || captions.length === 0) return false;
+  let found = false;
+  captions.forEach((caption, i) => {
+    if (Array.isArray(caption.embedding) && caption.embedding.length === EMBEDDING_MODEL_DIM) {
+      embeddings.set(sceneId(media.id, i), Float32Array.from(caption.embedding));
+      found = true;
+    }
+    // Palettes are tiny — always mirror from whatever the store has so
+    // the rank-time Map is a read-only snapshot of the source of truth.
+    if (Array.isArray(caption.palette) && caption.palette.length > 0) {
+      palettes.set(sceneId(media.id, i), caption.palette.map((entry) => ({
+        l: entry.l, a: entry.a, b: entry.b, weight: entry.weight,
+      })));
+    }
+  });
+  return found;
+}
+
+async function hydrateFromDisk(mediaId: string, expectedCount: number): Promise<{
+  text: boolean;
+  image: boolean;
+}> {
+  const meta = await getCaptionsEmbeddingsMeta(mediaId);
+  if (!meta) return { text: false, image: false };
+
+  let textOk = false;
+  if (meta.embeddingModel === EMBEDDING_MODEL_ID && meta.embeddingDim === EMBEDDING_MODEL_DIM) {
+    const vectors = await getCaptionEmbeddings(mediaId, meta.embeddingDim, expectedCount);
+    if (vectors) {
+      vectors.forEach((vector, i) => embeddings.set(sceneId(mediaId, i), vector));
+      textOk = true;
+    }
+  }
+
+  let imageOk = false;
+  if (
+    meta.imageEmbeddingModel === CLIP_MODEL_ID
+    && meta.imageEmbeddingDim === CLIP_EMBEDDING_DIM
+  ) {
+    const vectors = await getCaptionImageEmbeddings(mediaId, meta.imageEmbeddingDim, expectedCount);
+    if (vectors) {
+      vectors.forEach((vector, i) => imageEmbeddings.set(sceneId(mediaId, i), vector));
+      imageOk = true;
+    }
+  }
+
+  return { text: textOk, image: imageOk };
+}
+
+/**
+ * Ensure embeddings for every caption on `mediaId` are present in memory.
+ * Reuses already-loaded vectors; concurrent callers share a single disk read.
+ */
+export function ensureEmbeddingsLoaded(mediaId: string): Promise<void> {
+  const existing = pendingHydrates.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const media = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+
+    let textHydrated = populateFromInMemory(media);
+    let imageHydrated = imageEmbeddings.has(sceneId(mediaId, 0));
+
+    if (!textHydrated || !imageHydrated) {
+      const loaded = await hydrateFromDisk(mediaId, media.aiCaptions.length);
+      textHydrated ||= loaded.text;
+      imageHydrated ||= loaded.image;
+    }
+
+    if (!textHydrated) missingEmbeddings.add(mediaId);
+    if (!imageHydrated) missingImageEmbeddings.add(mediaId);
+  })().catch((error) => {
+    log.warn('Embedding hydrate failed', { mediaId, error });
+    missingEmbeddings.add(mediaId);
+    missingImageEmbeddings.add(mediaId);
+  }).finally(() => {
+    pendingHydrates.delete(mediaId);
+  });
+
+  pendingHydrates.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Run the embedding model over captions that have never been indexed,
+ * save the resulting `.bin`, patch captions.json with the model metadata,
+ * and populate the cache. Awaits `embeddingsProvider.ensureReady()` before
+ * doing any work, so the caller decides when to pay the model-download
+ * cost by choosing when to kick this off.
+ */
+export function indexMediaCaptions(mediaId: string): Promise<void> {
+  const existing = pendingIndexes.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const state = useMediaLibraryStore.getState();
+    const media = state.mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+    if (state.taggingMediaIds.has(mediaId)) return;
+
+    await embeddingsProvider.ensureReady();
+    // The main Analyze-with-AI pipeline owns this media during its run.
+    // Re-check after the (potentially long) model download to avoid racing
+    // it with a re-analysis that just started.
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) return;
+
+    // Gather the same context signals the main pipeline uses so a
+    // retroactively-indexed caption is embedded identically to one
+    // generated by Analyze-with-AI — otherwise semantic ranking would
+    // get two flavors of vectors in one library and drift in quality.
+    const transcript = await getTranscript(mediaId).catch(() => null);
+    const colorResults = await Promise.all(
+      media.aiCaptions.map(async (caption) => {
+        if (!caption.thumbRelPath) return { phrase: '', palette: [] as const };
+        try {
+          const blob = await getCaptionThumbnailBlob(caption.thumbRelPath);
+          if (!blob) return { phrase: '', palette: [] as const };
+          return await extractDominantColors(blob);
+        } catch {
+          return { phrase: '', palette: [] as const };
+        }
+      }),
+    );
+
+    const texts = media.aiCaptions.map((caption, i) => buildEmbeddingText({
+      caption: { text: caption.text, timeSec: caption.timeSec },
+      sceneData: caption.sceneData,
+      transcriptSegments: transcript?.segments,
+      colorPhrase: colorResults[i]?.phrase ?? '',
+    }));
+
+    const vectors = await embeddingsProvider.embedBatch(texts);
+    if (vectors.length !== texts.length) {
+      throw new Error(`Embedding returned ${vectors.length} vectors for ${texts.length} captions`);
+    }
+
+    await saveCaptionEmbeddings(mediaId, vectors, EMBEDDING_MODEL_DIM);
+    // Persist the model metadata on captions.json so future sessions know
+    // the bin matches. We rewrite the full captions payload — cheap, since
+    // retroactive indexing is an explicit user action, not a hot path.
+    // Stamp the extracted palettes onto each caption so retroactive
+    // indexing also populates color data for legacy captions without it.
+    const capturedCaptions = media.aiCaptions.map((caption, i) => {
+      const palette = colorResults[i]?.palette;
+      const next = { ...caption };
+      if (palette && palette.length > 0) next.palette = [...palette];
+      return next;
+    });
+    await mediaLibraryService.updateMediaCaptions(mediaId, capturedCaptions, {
+      embeddingModel: EMBEDDING_MODEL_ID,
+      embeddingDim: EMBEDDING_MODEL_DIM,
+    });
+    useMediaLibraryStore.getState().updateMediaCaptions(mediaId, capturedCaptions);
+
+    vectors.forEach((vector, i) => {
+      embeddings.set(sceneId(mediaId, i), vector);
+    });
+    missingEmbeddings.delete(mediaId);
+  })().finally(() => {
+    pendingIndexes.delete(mediaId);
+  });
+
+  pendingIndexes.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Generate CLIP image embeddings for every thumbnail-bearing caption on
+ * `mediaId`, persist the bin, update captions.json with the image model
+ * metadata, and populate the cache. The persisted bin layout must match
+ * caption indexes 1:1, so every thumbnail is required to exist on disk
+ * up front — if any is missing we bail rather than emit a short-count bin.
+ */
+export function indexMediaImageCaptions(mediaId: string): Promise<void> {
+  const existing = pendingImageIndexes.get(mediaId);
+  if (existing) return existing;
+
+  const promise = (async () => {
+    const state = useMediaLibraryStore.getState();
+    const media = state.mediaById[mediaId];
+    if (!media || !media.aiCaptions || media.aiCaptions.length === 0) return;
+    if (state.taggingMediaIds.has(mediaId)) return;
+
+    // Load every thumbnail up front — CLIP expects one vector per
+    // caption index, so a missing thumb anywhere in the series means we
+    // can't write a coherent bin for this media. Lazy-thumb will
+    // eventually generate them on next Scene Browser visit; skip and
+    // retry next time.
+    const blobs: Blob[] = [];
+    for (const caption of media.aiCaptions) {
+      if (!caption.thumbRelPath) return;
+      const blob = await getCaptionThumbnailBlob(caption.thumbRelPath);
+      if (!blob) return;
+      blobs.push(blob);
+    }
+
+    await clipProvider.ensureReady();
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) return;
+
+    const vectors = await clipProvider.embedImages(blobs);
+    if (vectors.length !== blobs.length) {
+      throw new Error(`CLIP returned ${vectors.length} vectors for ${blobs.length} thumbnails`);
+    }
+
+    await saveCaptionImageEmbeddings(mediaId, vectors, CLIP_EMBEDDING_DIM);
+
+    // Patch captions.json with the image-model metadata. Fetch the latest
+    // captions from the store so we preserve concurrent edits (rare, but
+    // re-analyze-and-index-at-the-same-time is the exact race we care about).
+    const latest = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (latest?.aiCaptions) {
+      await mediaLibraryService.updateMediaCaptions(mediaId, latest.aiCaptions, {
+        embeddingModel: EMBEDDING_MODEL_ID,
+        embeddingDim: EMBEDDING_MODEL_DIM,
+        imageEmbeddingModel: CLIP_MODEL_ID,
+        imageEmbeddingDim: CLIP_EMBEDDING_DIM,
+      });
+    }
+
+    vectors.forEach((vector, i) => {
+      imageEmbeddings.set(sceneId(mediaId, i), vector);
+    });
+    missingImageEmbeddings.delete(mediaId);
+  })().finally(() => {
+    pendingImageIndexes.delete(mediaId);
+  });
+
+  pendingImageIndexes.set(mediaId, promise);
+  return promise;
+}
+
+/**
+ * Drop cached embeddings for `mediaId`. Call after Analyze-with-AI finishes
+ * (new embeddings will be hydrated from the fresh in-memory caption array
+ * on next access) or when embeddings-on-disk go out of sync.
+ */
+export function invalidateEmbeddingsCache(mediaId: string): void {
+  const prefix = `${mediaId}:`;
+  for (const key of embeddings.keys()) {
+    if (key.startsWith(prefix)) embeddings.delete(key);
+  }
+  for (const key of imageEmbeddings.keys()) {
+    if (key.startsWith(prefix)) imageEmbeddings.delete(key);
+  }
+  for (const key of palettes.keys()) {
+    if (key.startsWith(prefix)) palettes.delete(key);
+  }
+  missingEmbeddings.delete(mediaId);
+  missingImageEmbeddings.delete(mediaId);
+  pendingHydrates.delete(mediaId);
+  pendingIndexes.delete(mediaId);
+  pendingImageIndexes.delete(mediaId);
+}
+
+/** Read-only view of the in-memory text embeddings cache, for ranking. */
+export function getEmbeddingsSnapshot(): Map<string, Float32Array> {
+  return embeddings;
+}
+
+/** Read-only view of the in-memory CLIP image embeddings cache. */
+export function getImageEmbeddingsSnapshot(): Map<string, Float32Array> {
+  return imageEmbeddings;
+}
+
+/** Read-only view of the in-memory color palette cache. */
+export function getPalettesSnapshot(): Map<string, PaletteEntry[]> {
+  return palettes;
+}
+
+/** Whether the given media is known to be missing text embeddings. */
+export function isMediaMissingEmbeddings(mediaId: string): boolean {
+  return missingEmbeddings.has(mediaId);
+}
+
+/** Whether the given media is known to be missing image embeddings. */
+export function isMediaMissingImageEmbeddings(mediaId: string): boolean {
+  return missingImageEmbeddings.has(mediaId);
+}
diff --git a/src/features/scene-browser/utils/invalidate.ts b/src/features/scene-browser/utils/invalidate.ts
new file mode 100644
index 000000000..389935ead
--- /dev/null
+++ b/src/features/scene-browser/utils/invalidate.ts
@@ -0,0 +1,24 @@
+/**
+ * Single entry point for "this media's captions are about to change —
+ * drop every cached thumbnail resource tied to it." Called by
+ * Analyze-with-AI (and any future re-caption flow) before the pipeline
+ * deletes old thumbs and writes new ones.
+ *
+ * Combines the blob URL cache (the hook that hands JPEG URLs to
+ * rows) and the lazy-thumb probe/generation cache (the queue that fills
+ * in pointers for pre-feature captions) in one call so callers don't
+ * have to know about the internal split.
+ */
+
+import { invalidateMediaCaptionThumbBlobs } from '../hooks/use-caption-thumbnail';
+import { invalidateEmbeddingsCache } from './embeddings-cache';
+import { invalidateLazyThumbCache } from './lazy-thumb';
+
+export function invalidateMediaCaptionThumbnails(mediaId: string): void {
+  invalidateMediaCaptionThumbBlobs(mediaId);
+  invalidateLazyThumbCache(mediaId);
+  // Semantic embeddings are tied 1:1 to caption indexes — a re-analyze
+  // throws away the old caption array and generates a fresh one, so the
+  // cached vectors no longer correspond to their (new) scenes.
+  invalidateEmbeddingsCache(mediaId);
+}
diff --git a/src/features/scene-browser/utils/lazy-thumb.ts b/src/features/scene-browser/utils/lazy-thumb.ts
new file mode 100644
index 000000000..7281a771b
--- /dev/null
+++ b/src/features/scene-browser/utils/lazy-thumb.ts
@@ -0,0 +1,280 @@
+/**
+ * Lazy thumbnail generator for captions that were created before the
+ * Scene Browser feature landed (`thumbRelPath` missing). Opens the source
+ * media, seeks to the caption timestamp, captures a JPEG, and persists it
+ * alongside the rest of that media's caption thumbs so the Scene Browser
+ * can pick it up on subsequent reads.
+ *
+ * Work is queued globally so we never spin up more than one HTMLVideoElement
+ * at a time — 161-caption libraries can otherwise exhaust memory on long
+ * clips. Images are handled via `fetch` + `createImageBitmap` (same as the
+ * LFM provider's image path).
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import { mediaLibraryService, useMediaLibraryStore, type MediaMetadata } from '../deps/media-library';
+import { probeCaptionThumbnail, saveCaptionThumbnail } from '../deps/storage';
+
+const log = createLogger('SceneBrowser:LazyThumb');
+
+const PERSIST_DEBOUNCE_MS = 1500;
+const pendingPersists = new Map<string, ReturnType<typeof setTimeout>>();
+
+/**
+ * Rewrite `captions.json` + the metadata mirror for `mediaId` with the
+ * current in-memory captions array. Coalesces rapid fire-and-forget
+ * updates from a stream of thumbnail writes into a single disk write per
+ * ~{@link PERSIST_DEBOUNCE_MS}ms window — 161 captions that each land a
+ * thumb in quick succession otherwise trigger 161 JSON rewrites.
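+ *
+ * E.g. ten thumbs landing 50 ms apart produce a single write ~1.5 s
+ * after the last one — each call resets the shared per-media timer.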
+ */
+function schedulePersist(mediaId: string): void {
+  const existing = pendingPersists.get(mediaId);
+  if (existing) clearTimeout(existing);
+  const timer = setTimeout(() => {
+    pendingPersists.delete(mediaId);
+    const latest = useMediaLibraryStore.getState().mediaById[mediaId];
+    if (!latest?.aiCaptions) return;
+    void mediaLibraryService
+      .updateMediaCaptions(mediaId, latest.aiCaptions)
+      .catch((error) => {
+        log.warn('Persisting caption thumb pointers failed', { mediaId, error });
+      });
+  }, PERSIST_DEBOUNCE_MS);
+  pendingPersists.set(mediaId, timer);
+}
+
+const MAX_DIM = 512;
+const SEEK_TIMEOUT_MS = 8_000;
+
+interface PendingRequest {
+  mediaId: string;
+  captionIndex: number;
+  timeSec: number;
+  resolve: (relPath: string | null) => void;
+}
+
+const queue: PendingRequest[] = [];
+let running = false;
+const resultCache = new Map<string, string | null>();
+const inflight = new Map<string, Promise<string | null>>();
+
+function cacheKey(mediaId: string, captionIndex: number): string {
+  return `${mediaId}:${captionIndex}`;
+}
+
+/**
+ * Drop the memoized probe + generation results for every caption of
+ * `mediaId` so a re-analyzed media starts from a clean slate. Queued
+ * requests that haven't started yet are dropped; in-flight generations
+ * are left to finish and are discarded at the write site via the
+ * `taggingMediaIds` gate below.
+ */
+export function invalidateLazyThumbCache(mediaId: string): void {
+  const prefix = `${mediaId}:`;
+  for (const key of resultCache.keys()) {
+    if (key.startsWith(prefix)) resultCache.delete(key);
+  }
+  for (let i = queue.length - 1; i >= 0; i -= 1) {
+    const request = queue[i]!;
+    if (request.mediaId === mediaId) {
+      request.resolve(null);
+      queue.splice(i, 1);
+    }
+  }
+  const pendingPersist = pendingPersists.get(mediaId);
+  if (pendingPersist) {
+    clearTimeout(pendingPersist);
+    pendingPersists.delete(mediaId);
+  }
+}
+
+async function seekVideoTo(video: HTMLVideoElement, timeSec: number): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    const timeout = setTimeout(() => {
+      cleanup();
+      reject(new Error(`Seek timed out at ${timeSec}s`));
+    }, SEEK_TIMEOUT_MS);
+    const onSeeked = () => {
+      cleanup();
+      resolve();
+    };
+    const onError = () => {
+      cleanup();
+      reject(new Error('Video seek failed'));
+    };
+    const cleanup = () => {
+      clearTimeout(timeout);
+      video.removeEventListener('seeked', onSeeked);
+      video.removeEventListener('error', onError);
+    };
+    video.addEventListener('seeked', onSeeked, { once: true });
+    video.addEventListener('error', onError, { once: true });
+    video.currentTime = Math.max(0, timeSec);
+  });
+}
+
+async function captureFrame(video: HTMLVideoElement): Promise<Blob> {
+  const vw = video.videoWidth || 640;
+  const vh = video.videoHeight || 360;
+  const scale = Math.min(MAX_DIM / Math.max(vw, vh), 1);
+  const width = Math.max(1, Math.round(vw * scale));
+  const height = Math.max(1, Math.round(vh * scale));
+  const canvas = new OffscreenCanvas(width, height);
+  const context = canvas.getContext('2d');
+  if (!context) throw new Error('OffscreenCanvas 2d context unavailable');
+  context.drawImage(video, 0, 0, width, height);
+  return canvas.convertToBlob({ type: 'image/jpeg', quality: 0.75 });
+}
+
+async function captureImage(blob: Blob): Promise<Blob> {
+  const bitmap = await createImageBitmap(blob);
+  try {
+    const scale = Math.min(MAX_DIM / Math.max(bitmap.width, bitmap.height), 1);
+    const width = Math.max(1, Math.round(bitmap.width * scale));
+    const height = Math.max(1, Math.round(bitmap.height * scale));
+    const canvas = new OffscreenCanvas(width, height);
+    const context = canvas.getContext('2d');
+    if (!context) throw new Error('OffscreenCanvas 2d context unavailable');
+    context.drawImage(bitmap, 0, 0, width, height);
+    return canvas.convertToBlob({ type: 'image/jpeg', quality: 0.75 });
+  } finally {
+    bitmap.close();
+  }
+}
+
+/**
+ * Patch the in-memory media item so subsequent renders see the new path,
+ * then schedule a debounced write-back so the pointer survives reloads.
+ */
+function patchStoreThumbPath(mediaId: string, captionIndex: number, relPath: string): void {
+  const store = useMediaLibraryStore.getState();
+  const media = store.mediaById[mediaId];
+  if (!media || !media.aiCaptions) return;
+  const existing = media.aiCaptions[captionIndex];
+  if (!existing || existing.thumbRelPath === relPath) return;
+  const updated: NonNullable<MediaMetadata['aiCaptions']> = media.aiCaptions.map((caption, i) =>
+    i === captionIndex ? { ...caption, thumbRelPath: relPath } : caption,
+  );
+  store.updateMediaCaptions(mediaId, updated);
+  schedulePersist(mediaId);
+}
+
+async function generateOne(request: PendingRequest): Promise<string | null> {
+  const { mediaId, captionIndex, timeSec } = request;
+  const state = useMediaLibraryStore.getState();
+  const media = state.mediaById[mediaId];
+  if (!media) return null;
+  // A concurrent Analyze-with-AI run owns this media's thumbs for the
+  // duration of its sweep — skip lazy work so we don't race the main
+  // pipeline and clobber a fresh thumbnail with a stale one.
+  if (state.taggingMediaIds.has(mediaId)) return null;
+
+  const isImage = media.mimeType.startsWith('image/');
+  const blobUrl = await mediaLibraryService.getMediaBlobUrl(mediaId);
+  if (!blobUrl) return null;
+
+  try {
+    let jpeg: Blob;
+    if (isImage) {
+      const response = await fetch(blobUrl);
+      const sourceBlob = await response.blob();
+      jpeg = await captureImage(sourceBlob);
+    } else {
+      const video = document.createElement('video');
+      video.muted = true;
+      video.preload = 'auto';
+      video.crossOrigin = 'anonymous';
+      video.src = blobUrl;
+      try {
+        await new Promise<void>((resolve, reject) => {
+          const onLoad = () => { cleanup(); resolve(); };
+          const onError = () => { cleanup(); reject(new Error('Video load failed')); };
+          const cleanup = () => {
+            video.removeEventListener('loadedmetadata', onLoad);
+            video.removeEventListener('error', onError);
+          };
+          video.addEventListener('loadedmetadata', onLoad, { once: true });
+          video.addEventListener('error', onError, { once: true });
+        });
+        await seekVideoTo(video, timeSec);
+        jpeg = await captureFrame(video);
+      } finally {
+        video.pause();
+        video.removeAttribute('src');
+        video.load();
+      }
+    }
+
+    // Re-check the tagging gate before writing — Analyze-with-AI may have
+    // started between our initial check and the slow seek + capture above.
+    if (useMediaLibraryStore.getState().taggingMediaIds.has(mediaId)) {
+      return null;
+    }
+    const relPath = await saveCaptionThumbnail(mediaId, captionIndex, jpeg);
+    patchStoreThumbPath(mediaId, captionIndex, relPath);
+    return relPath;
+  } catch (error) {
+    log.warn('Lazy thumbnail generation failed', { mediaId, captionIndex, timeSec, error });
+    return null;
+  } finally {
+    URL.revokeObjectURL(blobUrl);
+  }
+}
+
+async function drain(): Promise<void> {
+  if (running) return;
+  running = true;
+  try {
+    while (queue.length > 0) {
+      const request = queue.shift()!;
+      const key = cacheKey(request.mediaId, request.captionIndex);
+      const relPath = await generateOne(request);
+      resultCache.set(key, relPath);
+      request.resolve(relPath);
+    }
+  } finally {
+    running = false;
+  }
+}
+
+/**
+ * Request a thumbnail for a caption that has no persisted `thumbRelPath`.
+ * Returns the rel path of either the freshly-saved or already-on-disk thumb,
+ * or `null` when generation fails. Concurrent callers for the same
+ * (mediaId, captionIndex) share one job.
+ *
+ * The disk probe runs outside the generation queue so all scenes can probe
+ * in parallel on reload — only probe misses pay the price of the serial
+ * video-seek generation pipeline.
+ */
+export function requestLazyCaptionThumbnail(
+  mediaId: string,
+  captionIndex: number,
+  timeSec: number,
+): Promise<string | null> {
+  const key = cacheKey(mediaId, captionIndex);
+  const cached = resultCache.get(key);
+  if (cached !== undefined) return Promise.resolve(cached);
+
+  const pending = inflight.get(key);
+  if (pending) return pending;
+
+  const promise = (async () => {
+    const existing = await probeCaptionThumbnail(mediaId, captionIndex);
+    if (existing) {
+      patchStoreThumbPath(mediaId, captionIndex, existing);
+      resultCache.set(key, existing);
+      return existing;
+    }
+    const generated = await new Promise<string | null>((resolve) => {
+      queue.push({ mediaId, captionIndex, timeSec, resolve });
+      void drain();
+    });
+    resultCache.set(key, generated);
+    return generated;
+  })().finally(() => {
+    inflight.delete(key);
+  });
+  inflight.set(key, promise);
+  return promise;
+}
diff --git a/src/features/scene-browser/utils/library-palette.test.ts b/src/features/scene-browser/utils/library-palette.test.ts
new file mode 100644
index 000000000..e42ac30dd
--- /dev/null
+++ b/src/features/scene-browser/utils/library-palette.test.ts
@@ -0,0 +1,85 @@
+import { describe, expect, it } from 'vitest';
+import type { PaletteEntry } from '../deps/analysis';
+import { clusterPaletteEntries, flattenLibraryPalettes } from './library-palette';
+
+describe('flattenLibraryPalettes', () => {
+  it('normalizes each palette so long clips do not dominate', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 10, b: 10, weight: 0.8 }];
+    const b: PaletteEntry[] = [
+      { l: 30, a: 0, b: 0, weight: 0.5 },
+      { l: 60, a: 0, b: 0, weight: 0.5 },
+    ];
+    const flat = flattenLibraryPalettes([a, b]);
+    const totalA = flat.filter((e) => e.l === 50).reduce((s, e) => s + e.weight, 0);
+    const totalB = flat.filter((e) => e.l !== 50).reduce((s, e) => s + e.weight, 0);
+    expect(totalA).toBeCloseTo(1, 5);
+    expect(totalB).toBeCloseTo(1, 5);
+  });
+
+  it('skips empty or undefined palettes', () => {
+    const a: PaletteEntry[] = [{ l: 50, a: 10, b: 10, weight: 0.8 }];
+    expect(flattenLibraryPalettes([a, undefined, []])).toHaveLength(1);
+  });
+});
+
+describe('clusterPaletteEntries', () => {
+  it('returns empty for empty input', () => {
+    expect(clusterPaletteEntries([], 5)).toEqual([]);
+  });
+
+  it('caps cluster count at the entry count', () => {
+    const entries: PaletteEntry[] = [
+      { l: 50, a: 60, b: 40, weight: 1 },
+      { l: 40, a: 15, b: -60, weight: 1 },
+    ];
+    const clusters = clusterPaletteEntries(entries, 10);
+    expect(clusters).toHaveLength(2);
+  });
+
+  it('recovers well-separated source colors', () => {
+    // Three obvious color blobs with a bit of jitter per entry. The
+    // clusters should land near each of the three source centers.
+    const makeBlob = (base: { l: number; a: number; b: number }): PaletteEntry[] =>
+      Array.from({ length: 5 }, (_, i) => ({
+        l: base.l + (i - 2) * 0.3,
+        a: base.a + (i - 2) * 0.3,
+        b: base.b + (i - 2) * 0.3,
+        weight: 1,
+      }));
+
+    const entries = [
+      ...makeBlob({ l: 53, a: 70, b: 50 }), // red
+      ...makeBlob({ l: 40, a: 15, b: -60 }), // blue
+      ...makeBlob({ l: 90, a: -5, b: 80 }), // yellow
+    ];
+    const clusters = clusterPaletteEntries(entries, 3);
+    expect(clusters).toHaveLength(3);
+
+    // At least one cluster center should be close to each source blob.
+    const nearest = (target: { l: number; a: number; b: number }): number => {
+      let best = Infinity;
+      for (const c of clusters) {
+        const d = Math.sqrt(
+          (c.l - target.l) ** 2 + (c.a - target.a) ** 2 + (c.b - target.b) ** 2,
+        );
+        if (d < best) best = d;
+      }
+      return best;
+    };
+    expect(nearest({ l: 53, a: 70, b: 50 })).toBeLessThan(5);
+    expect(nearest({ l: 40, a: 15, b: -60 })).toBeLessThan(5);
+    expect(nearest({ l: 90, a: -5, b: 80 })).toBeLessThan(5);
+  });
+
+  it('weights cluster output by pixel coverage', () => {
+    const entries: PaletteEntry[] = [
+      { l: 50, a: 60, b: 40, weight: 0.9 }, // big red
+      { l: 40, a: 15, b: -60, weight: 0.05 }, // tiny blue
+      { l: 40, a: 15, b: -60, weight: 0.05 }, // tiny blue
+    ];
+    const clusters = clusterPaletteEntries(entries, 2);
+    expect(clusters).toHaveLength(2);
+    const sorted = [...clusters].sort((a, b) => b.weight - a.weight);
+    expect(sorted[0]?.weight).toBeGreaterThan(sorted[1]?.weight ?? 0);
+  });
+});
diff --git a/src/features/scene-browser/utils/library-palette.ts b/src/features/scene-browser/utils/library-palette.ts
new file mode 100644
index 000000000..067d06023
--- /dev/null
+++ b/src/features/scene-browser/utils/library-palette.ts
@@ -0,0 +1,190 @@
+/**
+ * Weighted k-means clustering of palette entries across the library.
+ *
+ * Color Mode shows a small grid of "the unique colors this library is
+ * made of" — for that to be usable we need to collapse the hundreds of
+ * per-scene palette entries into ~12 cluster centers in CIELAB space,
+ * weighted by each entry's pixel coverage so a vivid accent color
+ * doesn't get drowned out by huge neutral expanses of sky/wall.
+ *
+ * Init: deterministic k-means++ (heaviest entry, then farthest-weighted).
+ * Iteration: weighted mean in Lab.
+ * Distance: ∆E 2000 so perceptual differences drive cluster membership.
+ */
+
+import { deltaE2000, type LabColor, type PaletteEntry } from '../deps/analysis';
+
+export interface LabCluster extends LabColor {
+  /** Sum of pixel-coverage weights of all entries in the cluster. */
+  weight: number;
+  /** Count of raw palette entries that landed here. */
+  count: number;
+}
+
+/**
+ * Fold all media palettes into a single flat list, scaled so every
+ * scene contributes equally. Without the per-palette normalization a
+ * long clip would dominate just by having more caption frames indexed.
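+ *
+ * E.g. a two-entry palette with weights [0.6, 0.2] flattens to
+ * [0.75, 0.25]: each source palette sums to 1 before clustering.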
+ */
+export function flattenLibraryPalettes(
+  palettesBySource: Iterable<PaletteEntry[] | undefined>,
+): PaletteEntry[] {
+  const flat: PaletteEntry[] = [];
+  for (const palette of palettesBySource) {
+    if (!palette || palette.length === 0) continue;
+    const total = palette.reduce((sum, e) => sum + e.weight, 0);
+    if (total <= 0) continue;
+    for (const entry of palette) {
+      flat.push({
+        l: entry.l,
+        a: entry.a,
+        b: entry.b,
+        weight: entry.weight / total,
+      });
+    }
+  }
+  return flat;
+}
+
+/**
+ * Weighted k-means in Lab. Returns at most `k` cluster centers; empty
+ * clusters are dropped rather than re-seeded since for this UI "give me
+ * the N colors that actually exist" is more useful than "exactly N".
+ */
+export function clusterPaletteEntries(
+  entries: PaletteEntry[],
+  k: number,
+  maxIter = 20,
+): LabCluster[] {
+  if (entries.length === 0 || k <= 0) return [];
+  const effectiveK = Math.min(k, entries.length);
+
+  const centers: LabColor[] = seedCentersKMeansPP(entries, effectiveK);
+
+  for (let iter = 0; iter < maxIter; iter += 1) {
+    const assignments = assignEntriesToCenters(entries, centers);
+    const { centers: nextCenters, weights, counts } = recomputeCenters(entries, assignments, centers.length);
+    if (nextCenters.length === 0) break;
+
+    const converged = nextCenters.length === centers.length
+      && nextCenters.every((c, i) => {
+        const prev = centers[i];
+        return prev !== undefined && deltaE2000(c, prev) < 0.5;
+      });
+
+    centers.length = 0;
+    centers.push(...nextCenters);
+
+    if (converged) {
+      return centers.map((c, i) => ({
+        l: c.l, a: c.a, b: c.b,
+        weight: weights[i] ?? 0,
+        count: counts[i] ?? 0,
+      }));
+    }
+  }
+
+  // Final assignment for weights/counts when we exhaust iterations.
+  const assignments = assignEntriesToCenters(entries, centers);
+  const weights = new Array(centers.length).fill(0);
+  const counts = new Array(centers.length).fill(0);
+  for (let i = 0; i < entries.length; i += 1) {
+    const k = assignments[i]!;
+    weights[k] = (weights[k] ?? 0) + entries[i]!.weight;
+    counts[k] = (counts[k] ?? 0) + 1;
+  }
+  return centers.map((c, i) => ({
+    l: c.l, a: c.a, b: c.b,
+    weight: weights[i] ?? 0,
+    count: counts[i] ?? 0,
+  }));
+}
+
+function seedCentersKMeansPP(entries: PaletteEntry[], k: number): LabColor[] {
+  // Deterministic init so the grid doesn't re-order on every render.
+  // Pick the heaviest entry first, then greedily take the entry that
+  // maximizes min-distance-to-existing × weight (D² weighted sampling
+  // argmax instead of random sampling — same asymptotic quality, stable
+  // output).
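+  // Concretely: score(i) = minDist(i)² × weight(i), where minDist is the
+  // ∆E 2000 from entry i to its nearest already-picked center — the
+  // k-means++ D² criterion with the random draw replaced by an argmax.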
+  let heaviest = 0;
+  for (let i = 1; i < entries.length; i += 1) {
+    if (entries[i]!.weight > entries[heaviest]!.weight) heaviest = i;
+  }
+  const seed = entries[heaviest]!;
+  const centers: LabColor[] = [{ l: seed.l, a: seed.a, b: seed.b }];
+
+  while (centers.length < k) {
+    let bestIdx = -1;
+    let bestScore = -1;
+    for (let i = 0; i < entries.length; i += 1) {
+      const entry = entries[i]!;
+      let minD = Number.POSITIVE_INFINITY;
+      for (const c of centers) {
+        const d = deltaE2000(c, { l: entry.l, a: entry.a, b: entry.b });
+        if (d < minD) minD = d;
+      }
+      if (!Number.isFinite(minD)) continue;
+      const score = minD * minD * entry.weight;
+      if (score > bestScore) {
+        bestScore = score;
+        bestIdx = i;
+      }
+    }
+    if (bestIdx < 0 || bestScore <= 0) break;
+    const picked = entries[bestIdx]!;
+    centers.push({ l: picked.l, a: picked.a, b: picked.b });
+  }
+  return centers;
+}
+
+function assignEntriesToCenters(
+  entries: PaletteEntry[],
+  centers: LabColor[],
+): number[] {
+  const out = new Array<number>(entries.length);
+  for (let i = 0; i < entries.length; i += 1) {
+    const entry = entries[i]!;
+    let bestK = 0;
+    let bestD = Number.POSITIVE_INFINITY;
+    for (let k = 0; k < centers.length; k += 1) {
+      const d = deltaE2000(centers[k]!, { l: entry.l, a: entry.a, b: entry.b });
+      if (d < bestD) { bestD = d; bestK = k; }
+    }
+    out[i] = bestK;
+  }
+  return out;
+}
+
+function recomputeCenters(
+  entries: PaletteEntry[],
+  assignments: number[],
+  k: number,
+): { centers: LabColor[]; weights: number[]; counts: number[] } {
+  const sumL = new Array(k).fill(0);
+  const sumA = new Array(k).fill(0);
+  const sumB = new Array(k).fill(0);
+  const sumW = new Array(k).fill(0);
+  const counts = new Array(k).fill(0);
+
+  for (let i = 0; i < entries.length; i += 1) {
+    const entry = entries[i]!;
+    const cluster = assignments[i]!;
+    sumL[cluster] = (sumL[cluster] ?? 0) + entry.l * entry.weight;
+    sumA[cluster] = (sumA[cluster] ?? 0) + entry.a * entry.weight;
+    sumB[cluster] = (sumB[cluster] ?? 0) + entry.b * entry.weight;
+    sumW[cluster] = (sumW[cluster] ?? 0) + entry.weight;
+    counts[cluster] = (counts[cluster] ?? 0) + 1;
+  }
+
+  const centers: LabColor[] = [];
+  const weights: number[] = [];
+  const outCounts: number[] = [];
+  for (let i = 0; i < k; i += 1) {
+    const w = sumW[i] ?? 0;
+    if (w <= 0) continue;
+    centers.push({ l: sumL[i]! / w, a: sumA[i]! / w, b: sumB[i]! / w });
+    weights.push(w);
+    outCounts.push(counts[i] ?? 0);
+  }
+  return { centers, weights, counts: outCounts };
+}
diff --git a/src/features/scene-browser/utils/rank.test.ts b/src/features/scene-browser/utils/rank.test.ts
new file mode 100644
index 000000000..c3cd7fcec
--- /dev/null
+++ b/src/features/scene-browser/utils/rank.test.ts
@@ -0,0 +1,106 @@
+import { describe, expect, it } from 'vitest';
+import { rankScenes, type RankableScene } from './rank';
+
+function scene(id: string, text: string, extra: Partial<RankableScene> = {}): RankableScene {
+  return {
+    id,
+    mediaId: extra.mediaId ?? id.split(':')[0] ?? 'm1',
+    mediaFileName: extra.mediaFileName ?? 'clip.mp4',
+    timeSec: extra.timeSec ??
0, + text, + thumbRelPath: extra.thumbRelPath, + }; +} + +describe('rankScenes', () => { + it('returns scenes unchanged when the query is empty', () => { + const scenes = [scene('a', 'A chef plates pasta'), scene('b', 'Sunset over mountains')]; + const result = rankScenes('', scenes); + expect(result).toHaveLength(2); + expect(result.map((s) => s.id)).toEqual(['a', 'b']); + expect(result[0]!.matchSpans).toEqual([]); + }); + + it('scores exact substring matches above token matches', () => { + const scenes = [ + scene('a', 'A chef plating roasted chicken on a wooden board'), + scene('b', 'Kitchen preparation shot with chef tools'), + ]; + const result = rankScenes('roasted chicken', scenes); + expect(result[0]!.id).toBe('a'); + expect(result[0]!.score).toBeGreaterThan(result[1]?.score ?? 0); + }); + + it('matches on token overlap when no substring is present', () => { + const scenes = [ + scene('a', 'Wide shot of a kitchen counter with copper pots'), + scene('b', 'Living room with bookshelves'), + ]; + const result = rankScenes('kitchen pots', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('tolerates a single-char typo via trigram similarity', () => { + const scenes = [scene('a', 'Bright kitchen counter shot')]; + const result = rankScenes('kitchin', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('does not fuzzy-match on a shared suffix when the prefix differs', () => { + // "orange" vs "range" share four trigrams at the tail end — without a + // prefix gate, the whole mountain library falls out of a fruit query. + const scenes = [ + scene('a', 'A snowy mountain range with a field of green trees in the foreground.'), + scene('b', 'A tree with orange leaves is shown against a blue sky.'), + ]; + const result = rankScenes('orange', scenes); + expect(result.map((s) => s.id)).toEqual(['b']); + }); + + it('filters out scenes below the score threshold', () => { + const scenes = [ + scene('a', 'A chef plating roasted chicken'), + scene('b', 'Sunset over mountains'), + ]; + const result = rankScenes('kitchen', scenes); + expect(result.find((s) => s.id === 'b')).toBeUndefined(); + }); + + it('returns merged case-insensitive match spans', () => { + const scenes = [scene('a', 'Chef places a pan. 
Chef plates pasta.')]; + const result = rankScenes('chef', scenes); + expect(result).toHaveLength(1); + const spans = result[0]!.matchSpans; + expect(spans.length).toBeGreaterThanOrEqual(2); + for (const [from, to] of spans) { + expect(result[0]!.text.slice(from, to).toLowerCase()).toBe('chef'); + } + }); + + it('ignores punctuation differences between query and caption', () => { + const scenes = [scene('a', 'A close-up of a wine glass.')]; + const result = rankScenes('close up wine', scenes); + expect(result.map((s) => s.id)).toEqual(['a']); + }); + + it('matches richer scene-caption vocabulary for shot size and weather terms', () => { + const scenes = [ + scene('a', 'Wide shot of a city skyline at dusk.'), + scene('b', 'Medium close-up of a singer on a rainy street.'), + scene('c', 'Close-up of hands slicing limes on a cutting board.'), + ]; + + expect(rankScenes('wide shot dusk skyline', scenes).map((s) => s.id)).toEqual(['a']); + expect(rankScenes('rainy singer', scenes).map((s) => s.id)).toEqual(['b']); + expect(rankScenes('close up limes', scenes)[0]?.id).toBe('c'); + }); + + it('is stable in sort by filename then timestamp when scores tie', () => { + const scenes = [ + scene('b', 'chef pans', { timeSec: 10, mediaFileName: 'b.mp4' }), + scene('a', 'chef pans', { timeSec: 5, mediaFileName: 'a.mp4' }), + ]; + const result = rankScenes('chef pans', scenes); + expect(result.map((s) => s.id)).toEqual(['a', 'b']); + }); +}); diff --git a/src/features/scene-browser/utils/rank.ts b/src/features/scene-browser/utils/rank.ts new file mode 100644 index 000000000..6c439850b --- /dev/null +++ b/src/features/scene-browser/utils/rank.ts @@ -0,0 +1,244 @@ +/** + * Pure keyword + fuzzy ranking for scene captions. + * + * Kept dependency-free so it can be unit-tested without stores or storage, + * and moved into a worker later if ranking grows expensive (current 10k-scene + * runs complete in well under a frame on a modern laptop). + * + * Ranking is intentionally simple in v1: + * - exact substring match on the normalized caption text → 1.0 + * - ratio of query tokens that appear in the caption (whole-word or prefix) + * - trigram similarity as a tiebreak for typo tolerance + * Scores combine with max() rather than a linear blend so a clean substring + * match always beats a partial token overlap, regardless of caption length. + */ + +export interface RankableScene { + /** Stable composite id — typically `${mediaId}:${captionIndex}`. */ + id: string; + mediaId: string; + mediaFileName: string; + timeSec: number; + text: string; + thumbRelPath?: string; + /** + * Dominant-color palette (CIELAB + weight) for UI swatch display and + * color-query ranking. Plumbed through from `MediaCaption.palette`. + */ + palette?: Array<{ l: number; a: number; b: number; weight: number }>; +} + +/** + * Per-signal breakdown of why a scene ranked. Surfaced on the row so + * users can tell, at a glance, whether the match was driven by caption + * keywords, semantic text meaning, or visual (CLIP) similarity — which + * is the main UX gap that "I can't tell if semantic search is working" + * points at. + */ +export interface SceneMatchSignals { + /** Which ranker produced this row. */ + ranker: 'keyword' | 'semantic'; + /** Cosine against the text (all-MiniLM) embedding, when semantic mode ran. */ + textScore?: number; + /** Cosine against the CLIP image embedding, when visual ranking ran. */ + imageScore?: number; + /** True when the row cleared the keyword match threshold. 
*/
+  keywordMatched?: boolean;
+  /**
+   * Color family (e.g. `"red"`) that the query asked for and the
+   * caption text mentions. Set by the color-boost pass in the ranker.
+   * Present means the final score got a boost and the UI should show a
+   * Color chip; absent means no color match (or no color query).
+   */
+  colorMatch?: string;
+  /**
+   * Weighted-mean ∆E between the scene's palette and the user-selected
+   * reference palette, when "find similar palette" is active. Lower is
+   * closer; surfaced as a palette-distance chip on the row.
+   */
+  paletteDistance?: number;
+}
+
+export interface ScoredScene extends RankableScene {
+  score: number;
+  /** Character ranges within `text` that matched, for rendering. */
+  matchSpans: Array<[number, number]>;
+  signals: SceneMatchSignals;
+}
+
+export interface RankOptions {
+  /** Drop scenes below this score. Defaults to 0.25. */
+  threshold?: number;
+}
+
+const DEFAULT_THRESHOLD = 0.25;
+
+/** Strip punctuation and lowercase. Preserves letters, digits, CJK, whitespace. */
+function normalize(text: string): string {
+  return text.toLowerCase().replace(/[^\p{L}\p{N}\s]/gu, ' ').replace(/\s+/g, ' ').trim();
+}
+
+function trigrams(text: string): Set<string> {
+  const padded = ` ${text} `;
+  const set = new Set<string>();
+  for (let i = 0; i < padded.length - 2; i += 1) {
+    set.add(padded.slice(i, i + 3));
+  }
+  return set;
+}
+
+/**
+ * Trigram overlap coefficient between two tokens. Normalizing by the
+ * smaller token's trigram set is more forgiving than Jaccard for short
+ * typo-laden queries where the whole caption would otherwise dominate the
+ * denominator.
+ */
+function tokenTrigramSimilarity(a: string, b: string): number {
+  if (a.length < 3 || b.length < 3) return 0;
+  const left = trigrams(a);
+  const right = trigrams(b);
+  let overlap = 0;
+  for (const tri of left) {
+    if (right.has(tri)) overlap += 1;
+  }
+  const denominator = Math.min(left.size, right.size);
+  return denominator === 0 ? 0 : overlap / denominator;
+}
+
+/**
+ * Typo-tolerant match gate. Fuzzy matching alone is too permissive — "orange"
+ * and "range" share four of their five interior trigrams, so a naive trigram
+ * score would surface "mountain range" results for an "orange" query.
+ *
+ * Anchoring on a shared prefix (at least half the query token, capped at 3
+ * chars) keeps typos at the back of the word matching ("kitchin" → "kitchen")
+ * while rejecting coincidental substring overlaps.
+ */
+function sharesQueryPrefix(queryToken: string, captionToken: string): boolean {
+  if (queryToken.length < 3) return false;
+  const prefixLen = Math.min(3, Math.max(2, Math.floor(queryToken.length / 2)));
+  return captionToken.startsWith(queryToken.slice(0, prefixLen));
+}
+
+/** Best fuzzy match for a single query token against any caption token. */
+function bestFuzzyTokenScore(queryToken: string, captionTokens: string[]): number {
+  let best = 0;
+  for (const captionToken of captionTokens) {
+    if (!sharesQueryPrefix(queryToken, captionToken)) continue;
+    const similarity = tokenTrigramSimilarity(queryToken, captionToken);
+    if (similarity > best) best = similarity;
+    if (best === 1) return 1;
+  }
+  return best;
+}
+
+/**
+ * Find ranges in the original `text` (case-insensitive) that match any of
+ * the query tokens. Overlapping ranges are merged so the renderer
+ * doesn't have to deduplicate.
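+ *
+ * E.g. findMatchSpans('Chef plates. Chef!', ['chef']) →
+ * [[0, 4], [13, 17]] — both occurrences, case-insensitive, merged.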
+ */ +function findMatchSpans(text: string, tokens: string[]): Array<[number, number]> { + if (tokens.length === 0) return []; + const lower = text.toLowerCase(); + const raw: Array<[number, number]> = []; + for (const token of tokens) { + if (token.length === 0) continue; + let from = 0; + while (from <= lower.length - token.length) { + const idx = lower.indexOf(token, from); + if (idx < 0) break; + raw.push([idx, idx + token.length]); + from = idx + token.length; + } + } + if (raw.length === 0) return []; + raw.sort((a, b) => a[0] - b[0]); + const merged: Array<[number, number]> = []; + for (const span of raw) { + const last = merged[merged.length - 1]; + if (last && span[0] <= last[1]) { + last[1] = Math.max(last[1], span[1]); + } else { + merged.push([span[0], span[1]]); + } + } + return merged; +} + +const FUZZY_TOKEN_THRESHOLD = 0.6; + +function scoreScene(query: string, queryTokens: string[], scene: RankableScene): number { + const captionNormalized = normalize(scene.text); + if (captionNormalized.length === 0) return 0; + + if (query.length > 0 && captionNormalized.includes(query)) { + return 1; + } + + const captionTokens = captionNormalized.split(' '); + if (queryTokens.length === 0) return 0; + + let exactOrPrefix = 0; + let fuzzySum = 0; + for (const queryToken of queryTokens) { + if (captionTokens.some((token) => token === queryToken || token.startsWith(queryToken))) { + exactOrPrefix += 1; + fuzzySum += 1; + continue; + } + const fuzzy = bestFuzzyTokenScore(queryToken, captionTokens); + if (fuzzy >= FUZZY_TOKEN_THRESHOLD) { + fuzzySum += fuzzy; + } + } + + // Prefix-heavy matches get a small bonus so "kitchen pots" in caption wins + // over "kichen pts" in a different caption at the same fuzzy coverage. + const tokenScore = (exactOrPrefix / queryTokens.length) * 0.9; + const fuzzyScore = (fuzzySum / queryTokens.length) * 0.8; + + return Math.max(tokenScore, fuzzyScore); +} + +/** + * Rank scenes against `query`. Empty query returns scenes unchanged (no + * filtering, no sorting) so callers can show the default timestamp-sorted + * view without a second code path. + */ +export function rankScenes( + query: string, + scenes: RankableScene[], + options: RankOptions = {}, +): ScoredScene[] { + const normalizedQuery = normalize(query); + if (normalizedQuery.length === 0) { + return scenes.map((scene) => ({ + ...scene, + score: 0, + matchSpans: [], + signals: { ranker: 'keyword' }, + })); + } + + const threshold = options.threshold ?? DEFAULT_THRESHOLD; + const queryTokens = normalizedQuery.split(' ').filter(Boolean); + + const scored: ScoredScene[] = []; + for (const scene of scenes) { + const score = scoreScene(normalizedQuery, queryTokens, scene); + if (score < threshold) continue; + scored.push({ + ...scene, + score, + matchSpans: findMatchSpans(scene.text, queryTokens), + signals: { ranker: 'keyword', keywordMatched: true }, + }); + } + + scored.sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; + if (a.mediaFileName !== b.mediaFileName) return a.mediaFileName.localeCompare(b.mediaFileName); + return a.timeSec - b.timeSec; + }); + return scored; +} diff --git a/src/features/scene-browser/utils/seek.ts b/src/features/scene-browser/utils/seek.ts new file mode 100644 index 000000000..a4fbfccf9 --- /dev/null +++ b/src/features/scene-browser/utils/seek.ts @@ -0,0 +1,38 @@ +/** + * Seek a scene in the Source Monitor. 
Mirrors media-card's handleSeekToCaption + * so Scene Browser rows open the source preview the same way a caption + * timestamp click does — setting the source player state alone isn't enough, + * the editor store's sourcePreviewMediaId is what actually mounts the panel. + */ + +import { + useEditorStore, + useMediaLibraryStore, + useSourcePlayerStore, +} from '../deps/media-library'; + +export const SCENE_SELECTION_DURATION_SEC = 3; + +export function seekToScene(mediaId: string, timeSec: number): void { + const media = useMediaLibraryStore.getState().mediaById[mediaId]; + if (!media) return; + const fps = media.fps || 30; + const sourceDurationFrames = Math.max(1, Math.round(media.duration * fps)); + const frame = Math.max(0, Math.min(sourceDurationFrames - 1, Math.round(timeSec * fps))); + const outFrame = Math.min( + sourceDurationFrames, + frame + Math.max(1, Math.round(SCENE_SELECTION_DURATION_SEC * fps)), + ); + + const source = useSourcePlayerStore.getState(); + // Pause the current scene synchronously — waiting for the seek-consume + // effect leaves the video element decoding the old frame, which is + // what the user sees as "flash of the old scene" when switching. + source.playerMethods?.pause(); + source.setCurrentMediaId(mediaId); + source.clearInOutPoints(); + source.setInPoint(frame); + source.setOutPoint(outFrame); + source.setPendingSeekFrame(frame); + useEditorStore.getState().setSourcePreviewMediaId(mediaId); +} diff --git a/src/features/scene-browser/utils/semantic-rank.test.ts b/src/features/scene-browser/utils/semantic-rank.test.ts new file mode 100644 index 000000000..b6d8655ac --- /dev/null +++ b/src/features/scene-browser/utils/semantic-rank.test.ts @@ -0,0 +1,362 @@ +import { describe, expect, it } from 'vitest'; +import { + cosineSimilarity, + semanticRank, + SEMANTIC_MATCH_THRESHOLD, +} from './semantic-rank'; +import type { RankableScene } from './rank'; + +function unit(values: number[]): Float32Array { + const magnitude = Math.sqrt(values.reduce((sum, x) => sum + x * x, 0)) || 1; + return Float32Array.from(values.map((x) => x / magnitude)); +} + +function scene(id: string, text: string): RankableScene { + return { + id, + mediaId: id.split(':')[0] ?? 
'm1', + mediaFileName: `${id}.mp4`, + timeSec: 0, + text, + }; +} + +describe('cosineSimilarity', () => { + it('returns 1 for identical unit vectors', () => { + const a = unit([1, 2, 3]); + expect(cosineSimilarity(a, a)).toBeCloseTo(1, 5); + }); + + it('returns 0 for orthogonal vectors', () => { + const a = unit([1, 0]); + const b = unit([0, 1]); + expect(cosineSimilarity(a, b)).toBeCloseTo(0, 5); + }); + + it('returns 0 when dimensions differ', () => { + expect(cosineSimilarity(unit([1, 0]), unit([1, 0, 0]))).toBe(0); + }); +}); + +describe('semanticRank', () => { + it('orders scenes by descending cosine similarity to the query', () => { + const query = unit([1, 0, 0]); + const scenes = [scene('a:0', 'first'), scene('b:0', 'second'), scene('c:0', 'third')]; + const embeddings = new Map([ + ['a:0', unit([0.9, 0.1, 0])], + ['b:0', unit([0.2, 1, 0])], + ['c:0', unit([1, 0, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['c:0', 'a:0', 'b:0']); + }); + + it('drops scenes below the threshold', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'a'), scene('b:0', 'b')]; + const embeddings = new Map([ + ['a:0', unit([0.99, 0.01])], + ['b:0', unit([0.01, 0.99])], + ]); + const result = semanticRank(query, scenes, embeddings); + expect(result.map((s) => s.id)).toEqual(['a:0']); + expect(result[0]!.score).toBeGreaterThan(SEMANTIC_MATCH_THRESHOLD); + }); + + it('skips scenes that have no embedding in the map', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'with'), scene('b:0', 'without')]; + const embeddings = new Map([ + ['a:0', unit([1, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('returns empty matchSpans so highlighting stays sane', () => { + const query = unit([1, 0]); + const scenes = [scene('a:0', 'orange sky over water')]; + const embeddings = new Map([['a:0', unit([1, 0])]]); + const [top] = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(top!.matchSpans).toEqual([]); + }); + + it('stable-sorts ties by filename then timestamp', () => { + const query = unit([1, 0]); + const scenes: RankableScene[] = [ + { id: 'b:0', mediaId: 'b', mediaFileName: 'b.mp4', timeSec: 5, text: 'b' }, + { id: 'a:0', mediaId: 'a', mediaFileName: 'a.mp4', timeSec: 10, text: 'a' }, + ]; + const embeddings = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([1, 0])], + ]); + const result = semanticRank(query, scenes, embeddings, { threshold: 0 }); + expect(result.map((s) => s.id)).toEqual(['a:0', 'b:0']); + }); +}); + +describe('semanticRank with CLIP image signal', () => { + it('falls through to image match when caption text is weak', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0, 0]); + const scenes = [scene('a:0', 'terse caption')]; + const textEmbeds = new Map([ + ['a:0', unit([0.05, 1])], // nearly orthogonal to text query + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.9, 0.1, 0])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + expect(result[0]!.score).toBeGreaterThan(0.5); + }); + + it('takes max of text and image scores when both are present', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0, 0]); + const scenes = [scene('a:0', 'strong text'), scene('b:0', 
'strong image')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], // text cosine ≈ 1 + ['b:0', unit([0.1, 1])], // text cosine low + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.1, 1, 0])], // image cosine low + ['b:0', unit([1, 0, 0])], // image cosine ≈ 1 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + threshold: 0.2, + imageThreshold: 0.2, + }); + expect(result.map((s) => s.id).sort()).toEqual(['a:0', 'b:0']); + expect(result[0]!.score).toBeGreaterThan(0.9); + expect(result[1]!.score).toBeGreaterThan(0.9); + }); + + it('drops a scene only when both signals are below their thresholds', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'weak everywhere')]; + const textEmbeds = new Map([ + ['a:0', unit([0.1, 1])], // cosine ≈ 0.1 + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.05, 1])], // cosine ≈ 0.05 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('drops a scene whose only signal is a 0.21 visual match (below 0.22 threshold)', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('tower:0', 'A tall green tower at night')]; + const textEmbeds = new Map(); // no text match at all + // 0.21 cosine — the exact false-positive level observed in the wild + // for one-word queries before we raised the threshold. + const imageEmbeds = new Map([ + ['tower:0', unit([0.21, Math.sqrt(1 - 0.21 * 0.21)])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('drops a scene whose only signal is a Fair-tier visual match with no text support', () => { + // The "seated down → doorknob close-up" failure: CLIP cosine just + // above the 0.22 floor but no text corroboration. Should not pass. + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('doorknob:0', 'Close-up of a hand gripping a doorknob')]; + const textEmbeds = new Map([ + ['doorknob:0', unit([0.1, 1])], // text cosine ≈ 0.1, below Fair floor + ]); + const imageEmbeds = new Map([ + ['doorknob:0', unit([0.25, Math.sqrt(1 - 0.25 * 0.25)])], // image cosine ≈ 0.25 + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); + }); + + it('accepts a Fair-Fair scene where both sides mutually confirm', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'An elderly couple sits in a wheelchair')]; + // 0.32 text (Fair) × 0.23 image (Fair) — weak alone, confirming together. 
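+ // unit([x, Math.sqrt(1 - x * x)]) builds a unit vector whose cosine against unit([1, 0]) is exactly x, which is how these tests dial in precise tier scores.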
+ const textEmbeds = new Map([ + ['a:0', unit([0.32, Math.sqrt(1 - 0.32 * 0.32)])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.23, Math.sqrt(1 - 0.23 * 0.23)])], + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('accepts a scene on strong text alone, even when image is weak', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'strong text')]; + const textEmbeds = new Map([ + ['a:0', unit([0.5, Math.sqrt(1 - 0.5 * 0.5)])], // text cosine = 0.5, strong + ]); + const imageEmbeds = new Map([ + ['a:0', unit([0.15, Math.sqrt(1 - 0.15 * 0.15)])], // below Fair floor + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('still ranks a scene that has no image embedding on text alone', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'text-only scene')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], + ]); + const imageEmbeds = new Map(); // empty + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + }); + expect(result.map((s) => s.id)).toEqual(['a:0']); + }); + + it('ignores image side when queryImageEmbedding is null', () => { + const textQuery = unit([1, 0]); + const scenes = [scene('a:0', 'has image not text')]; + const textEmbeds = new Map([ + ['a:0', unit([0.1, 1])], // weak text + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong image + ]); + const result = semanticRank(textQuery, scenes, textEmbeds, { + queryImageEmbedding: null, + imageEmbeddings: imageEmbeds, + }); + expect(result).toEqual([]); // image was strong but query image embed absent + }); + + it('uses palette-only ranking for explicit pure color queries', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'A person in a yellow jacket'), scene('b:0', 'A dark blue hallway')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong text match that should be ignored + ['b:0', unit([0, 1])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], // strong visual match that should be ignored + ['b:0', unit([0, 1])], + ]); + const palettes = new Map([ + ['a:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], // blue + ['b:0', [{ l: 90, a: -5, b: 80, weight: 0.9 }]], // yellow + ]); + + const result = semanticRank(textQuery, scenes, textEmbeds, { + query: 'yellow color', + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + palettes, + }); + + expect(result.map((s) => s.id)).toEqual(['b:0']); + expect(result[0]?.signals.colorMatch).toBe('yellow'); + expect(result[0]?.signals.textScore).toBeUndefined(); + expect(result[0]?.signals.imageScore).toBeUndefined(); + }); + + it('keeps semantic text/image scoring for mixed color-content queries', () => { + const textQuery = unit([1, 0]); + const imageQuery = unit([1, 0]); + const scenes = [scene('a:0', 'Yellow kitchen interior'), scene('b:0', 'Blue hallway')]; + const textEmbeds = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([0, 1])], + ]); + const imageEmbeds = new Map([ + ['a:0', unit([1, 0])], + ['b:0', unit([0, 1])], + ]); + const palettes = new Map([ + ['a:0', [{ 
l: 90, a: -5, b: 80, weight: 0.9 }]], + ['b:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], + ]); + + const result = semanticRank(textQuery, scenes, textEmbeds, { + query: 'yellow color kitchen', + queryImageEmbedding: imageQuery, + imageEmbeddings: imageEmbeds, + palettes, + }); + + expect(result[0]?.id).toBe('a:0'); + expect(result[0]?.signals.colorMatch).toBe('yellow'); + expect(result[0]?.signals.textScore).toBeDefined(); + expect(result[0]?.signals.imageScore).toBeDefined(); + }); + + it('ranks by palette similarity and ignores text scores when referencePalette is set', () => { + // With a reference palette, the ranker should find scenes whose + // palettes are perceptually close to the reference, regardless of + // how well the text side matches the query vector. + const query = unit([1, 0]); + const scenes = [ + scene('warm:0', 'an unrelated caption'), + scene('cool:0', 'a perfect text match'), + ]; + const textEmbeds = new Map([ + ['warm:0', unit([0, 1])], + ['cool:0', unit([1, 0])], + ]); + const palettes = new Map([ + ['warm:0', [{ l: 53, a: 70, b: 50, weight: 0.9 }]], + ['cool:0', [{ l: 40, a: 15, b: -60, weight: 0.9 }]], + ]); + const referencePalette = [{ l: 53, a: 70, b: 50, weight: 1 }]; + + const result = semanticRank(query, scenes, textEmbeds, { + palettes, + referencePalette, + }); + + expect(result.map((s) => s.id)).toEqual(['warm:0']); + expect(result[0]?.signals.paletteDistance).toBeDefined(); + expect(result[0]?.signals.textScore).toBeUndefined(); + }); + + it('falls back to the scene-level palette when paletteMap lacks the id', () => { + const query = unit([1, 0]); + const warmPalette = [{ l: 53, a: 70, b: 50, weight: 1 }]; + const scenes: RankableScene[] = [ + { ...scene('warm:0', 'x'), palette: warmPalette }, + ]; + const textEmbeds = new Map(); + const result = semanticRank(query, scenes, textEmbeds, { + referencePalette: warmPalette, + }); + expect(result).toHaveLength(1); + expect(result[0]?.score).toBeGreaterThan(0); + }); +}); diff --git a/src/features/scene-browser/utils/semantic-rank.ts b/src/features/scene-browser/utils/semantic-rank.ts new file mode 100644 index 000000000..4d8101de3 --- /dev/null +++ b/src/features/scene-browser/utils/semantic-rank.ts @@ -0,0 +1,205 @@ +/** + * Semantic ranker — cosine similarity over unit-length caption embeddings. + * + * Vectors coming out of `embeddingsProvider` are already L2-normalized + * (the worker uses `normalize: true`), so cosine similarity reduces to + * a dot product here. Keeping this module dependency-free makes it + * cheap to unit-test without spinning up a worker. + */ + +import type { PaletteEntry } from '../deps/analysis'; +import { + colorBoostFor, + paletteSimilarityBoost, + parseColorQuery, + type ColorBoostResult, +} from './color-boost'; +import type { RankableScene, ScoredScene } from './rank'; + +/** "Fair" tier floor for text cosines — a weakly confirming signal. */ +export const SEMANTIC_MATCH_THRESHOLD = 0.3; + +/** + * CLIP cosine scores cluster in a much narrower range than all-MiniLM + * text-to-text scores — even a strong visual match rarely clears 0.35, + * whereas a strong text match can hit 0.7+. Using separate thresholds + * keeps both signals on equal footing when we combine them below. + * + * 0.22 is the "Fair" floor — a weakly confirming signal. It used to be + * the *accept* threshold, but at that level CLIP's short-query + * distribution put ~50% of a 200-scene corpus past it on almost any + * prompt (the "seated down → skateboarding, doorknobs" failure). 
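+ * (The regression tests in semantic-rank.test.ts pin that case: a lone 0.21 image cosine must not rank, while 0.32 text plus 0.23 image, two mutually confirming Fair signals, must.)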
Now + * it gates combined weak-signal acceptance: Fair-Fair only counts when + * the text side ALSO clears its Fair floor. + */ +export const SEMANTIC_IMAGE_MATCH_THRESHOLD = 0.22; + +/** "Good" tier floor for text — strong enough to accept alone. */ +export const SEMANTIC_TEXT_STRONG_THRESHOLD = 0.4; + +/** "Strong" tier floor for CLIP image cosines — strong enough to accept alone. */ +export const SEMANTIC_IMAGE_STRONG_THRESHOLD = 0.3; + +export interface SemanticRankOptions { + /** Minimum text cosine to retain a scene (default 0.3). */ + threshold?: number; + /** Minimum image cosine to retain a scene (default 0.22). */ + imageThreshold?: number; + /** CLIP-text-encoder embedding of the query, for matching image side. */ + queryImageEmbedding?: Float32Array | null; + /** sceneId → CLIP image embedding, parallel to the text embeddings map. */ + imageEmbeddings?: Map<string, Float32Array>; + /** + * Raw user query. When it contains color terms, the ranker computes + * per-scene ∆E 2000 distance against each palette entry and folds + * the best match into the final score. Sidesteps CLIP's weakness on + * bare color queries. + */ + query?: string; + /** sceneId → dominant-color palette (CIELAB + weights). */ + palettes?: Map<string, PaletteEntry[]>; + /** + * Reference palette for "find similar colors" mode. When set, the + * ranker switches to palette-similarity scoring and ignores text/CLIP + * signals — object semantics aren't part of "scenes with this palette". + */ + referencePalette?: PaletteEntry[] | null; +} + +export function cosineSimilarity(a: Float32Array, b: Float32Array): number { + if (a.length !== b.length) return 0; + let sum = 0; + for (let i = 0; i < a.length; i += 1) { + sum += a[i]! * b[i]!; + } + return sum; +} + +/** + * Rank scenes by cosine similarity to the query embedding. When a CLIP + * text-encoder query embedding and parallel image-embedding map are + * supplied, each scene's final score is `max(text_cosine, image_cosine)` + * (each gated by its own threshold). This makes missing signals harmless: + * a scene without image embeddings still ranks on text alone, and a scene + * with a weak caption can still surface on visual match. + * + * Scenes whose id is absent from *both* embedding maps are dropped — + * they have no semantic signal to rank on. Callers should handle that + * via keyword fallback or the retroactive indexer. + */ +export function semanticRank( + queryEmbedding: Float32Array, + scenes: RankableScene[], + embeddings: Map<string, Float32Array>, + options: SemanticRankOptions = {}, +): ScoredScene[] { + const threshold = options.threshold ?? SEMANTIC_MATCH_THRESHOLD; + const imageThreshold = options.imageThreshold ?? SEMANTIC_IMAGE_MATCH_THRESHOLD; + const queryImage = options.queryImageEmbedding ?? null; + const imageMap = options.imageEmbeddings; + const paletteMap = options.palettes; + const referencePalette = options.referencePalette ?? null; + + // Parse color intent once so the per-scene loop stays tight. Explicit + // palette queries bypass text/CLIP scoring; mixed queries still get a + // palette boost on top of semantic meaning. A reference palette forces + // palette-only scoring regardless of the query shape. + const colorQuery = options.query ?
parseColorQuery(options.query) : { colors: [], paletteOnly: false }; + const queryColors = colorQuery.colors; + const hasColorQuery = queryColors.length > 0; + const paletteOnly = !!referencePalette || colorQuery.paletteOnly; + + const scored: ScoredScene[] = []; + for (const scene of scenes) { + if (referencePalette) { + const scenePalette = paletteMap?.get(scene.id) ?? scene.palette; + const similarity = paletteSimilarityBoost(referencePalette, scenePalette); + if (!similarity) continue; + scored.push({ + ...scene, + score: similarity.boost, + matchSpans: [], + signals: { + ranker: 'semantic', + paletteDistance: similarity.distance, + }, + }); + continue; + } + const textVector = embeddings.get(scene.id); + const imageVector = queryImage && imageMap ? imageMap.get(scene.id) : undefined; + + const textScore = textVector ? cosineSimilarity(queryEmbedding, textVector) : 0; + const imageScore = imageVector && queryImage + ? cosineSimilarity(queryImage, imageVector) + : 0; + + let colorBoost: ColorBoostResult | null = null; + if (hasColorQuery && paletteMap) { + colorBoost = colorBoostFor(queryColors, paletteMap.get(scene.id)); + } + + // Accept logic is side-aware: + // - When both text and image sides exist for this scene, weak + // "Fair" signals are only accepted when mutually confirmed — + // without this gate ~50% of a 200-scene corpus clears the Fair + // CLIP floor on almost any short query (cosines cluster tight), + // so unrelated thumbnails (doorknobs, skateboarding) surface. + // - When only one side is available (CLIP still loading, or scene + // not image-indexed yet), fall back to the per-side floor so + // honest single-signal matches still show up. + // - Image-alone is held to the strong bar — a CLIP-only Fair match + // is the exact noise pattern we're trying to kill. + const hasTextSide = !paletteOnly && !!textVector; + const hasImageSide = !paletteOnly && !!imageVector; + const fairText = hasTextSide && textScore >= threshold; + const fairImage = hasImageSide && imageScore >= imageThreshold; + const strongText = hasTextSide && textScore >= SEMANTIC_TEXT_STRONG_THRESHOLD; + const strongImage = hasImageSide && imageScore >= SEMANTIC_IMAGE_STRONG_THRESHOLD; + + let accept: boolean; + if (hasTextSide && hasImageSide) { + accept = strongText || strongImage || (fairText && fairImage); + } else if (hasTextSide) { + accept = fairText; + } else if (hasImageSide) { + accept = strongImage; + } else { + accept = false; + } + + const textOk = accept && fairText; + const imageOk = accept && fairImage; + const colorOk = !!colorBoost; + if (!accept && !colorOk) continue; + + // Max of text / image / color signals — weakest side doesn't drag + // down a strong one. The color boost is already in cosine-compatible + // units (see `MAX_BOOST` in color-boost.ts). + const baseScore = Math.max( + textOk ? textScore : 0, + imageOk ? imageScore : 0, + ); + const score = colorBoost ? Math.max(baseScore, colorBoost.boost) : baseScore; + + // Semantic matches don't map to character spans in the caption text, + // so highlighting is empty — the rest of the UI handles that case. + scored.push({ + ...scene, + score, + matchSpans: [], + signals: { + ranker: 'semantic', + textScore: !paletteOnly && textVector ? textScore : undefined, + imageScore: !paletteOnly && imageVector ? 
imageScore : undefined, + colorMatch: colorBoost?.family, + }, + }); + } + scored.sort((a, b) => { + if (b.score !== a.score) return b.score - a.score; + if (a.mediaFileName !== b.mediaFileName) return a.mediaFileName.localeCompare(b.mediaFileName); + return a.timeSec - b.timeSec; + }); + return scored; +} diff --git a/src/features/settings/components/hotkey-editor-sections.ts b/src/features/settings/components/hotkey-editor-sections.ts index c559bf245..68b4f644c 100644 --- a/src/features/settings/components/hotkey-editor-sections.ts +++ b/src/features/settings/components/hotkey-editor-sections.ts @@ -113,6 +113,7 @@ export const HOTKEY_EDITOR_SECTIONS: readonly HotkeyEditorSection[] = [ items: [ { label: 'Save project', keys: ['SAVE'] }, { label: 'Export video', keys: ['EXPORT'] }, + { label: 'Open Scene Browser (search AI captions)', keys: ['OPEN_SCENE_BROWSER'] }, ], }, ] as const; diff --git a/src/features/settings/stores/settings-store.test.ts b/src/features/settings/stores/settings-store.test.ts index 20c91b226..d62b428da 100644 --- a/src/features/settings/stores/settings-store.test.ts +++ b/src/features/settings/stores/settings-store.test.ts @@ -8,7 +8,7 @@ const DEFAULT_SETTINGS = { editorDensity: 'compact' as const, maxUndoHistory: 50, autoSaveInterval: 0, - defaultWhisperModel: 'whisper-tiny' as const, + defaultWhisperModel: 'whisper-small' as const, defaultWhisperQuantization: 'hybrid' as const, defaultWhisperLanguage: '', }; @@ -26,7 +26,7 @@ describe('settings-store', () => { expect(state.editorDensity).toBe('compact'); expect(state.maxUndoHistory).toBe(50); expect(state.autoSaveInterval).toBe(0); - expect(state.defaultWhisperModel).toBe('whisper-tiny'); + expect(state.defaultWhisperModel).toBe('whisper-small'); expect(state.defaultWhisperQuantization).toBe('hybrid'); expect(state.defaultWhisperLanguage).toBe(''); }); @@ -53,6 +53,12 @@ describe('settings-store', () => { expect(useSettingsStore.getState().defaultWhisperQuantization).toBe('q8'); }); + it('normalizes legacy tiny model selections back to small', () => { + useSettingsStore.getState().setSetting('defaultWhisperModel', 'whisper-tiny'); + + expect(useSettingsStore.getState().defaultWhisperModel).toBe('whisper-small'); + }); + it('updates auto-save interval', () => { useSettingsStore.getState().setSetting('autoSaveInterval', 5); expect(useSettingsStore.getState().autoSaveInterval).toBe(5); diff --git a/src/features/settings/stores/settings-store.ts b/src/features/settings/stores/settings-store.ts index 0934c6a12..79f407c22 100644 --- a/src/features/settings/stores/settings-store.ts +++ b/src/features/settings/stores/settings-store.ts @@ -5,6 +5,7 @@ import { DEFAULT_WHISPER_LANGUAGE, DEFAULT_WHISPER_MODEL, DEFAULT_WHISPER_QUANTIZATION, + normalizeSelectableWhisperModel, } from '@/shared/utils/whisper-settings'; import type { EditorDensityPresetName } from '@/app/editor-layout'; import { DEFAULT_EDITOR_DENSITY_PRESET } from '@/app/editor-layout'; @@ -37,10 +38,64 @@ interface AppSettings { defaultWhisperQuantization: MediaTranscriptQuantization; defaultWhisperLanguage: string; + // AI captioning — interval between sampled frames when running LFM captions. + // Frames mode is converted to seconds at capture time using media.fps. + captioningIntervalUnit: CaptioningIntervalUnit; + captioningIntervalValue: number; + + // Scene Browser — how caption search matches queries. `semantic` uses a + // sentence-transformer model to rank by meaning; `keyword` uses + // substring + fuzzy-prefix matching on caption text. 
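+ // Both rankers return the same ScoredScene shape (score, matchSpans, signals), so the Scene Browser UI renders either mode's results identically.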
+ captionSearchMode: CaptionSearchMode; + // Keyboard shortcuts hotkeyOverrides: HotkeyOverrideMap; } +export type CaptionSearchMode = 'keyword' | 'semantic'; + +function normalizeCaptionSearchMode(value: unknown): CaptionSearchMode { + return value === 'semantic' ? 'semantic' : 'keyword'; +} + +export type CaptioningIntervalUnit = 'seconds' | 'frames'; + +export const CAPTIONING_INTERVAL_BOUNDS = { + seconds: { min: 0.5, max: 60 }, + frames: { min: 1, max: 1800 }, +} as const; + +export const DEFAULT_CAPTIONING_INTERVAL_SECONDS = 3; + +function normalizeCaptioningIntervalUnit(value: unknown): CaptioningIntervalUnit { + return value === 'frames' ? 'frames' : 'seconds'; +} + +function clampCaptioningIntervalValue( + value: unknown, + unit: CaptioningIntervalUnit, +): number { + const bounds = CAPTIONING_INTERVAL_BOUNDS[unit]; + const fallback = unit === 'seconds' ? DEFAULT_CAPTIONING_INTERVAL_SECONDS : 90; + const numeric = typeof value === 'number' && Number.isFinite(value) ? value : fallback; + return Math.min(bounds.max, Math.max(bounds.min, numeric)); +} + +/** + * Derive the effective `sampleIntervalSec` to pass to the captioning provider. + * Frames mode divides by the source media FPS (falling back to 30 when the + * media reports no usable frame rate). + */ +export function resolveCaptioningIntervalSec( + unit: CaptioningIntervalUnit, + value: number, + fps: number, +): number { + if (unit === 'seconds') return value; + const effectiveFps = fps > 0 ? fps : 30; + return value / effectiveFps; +} + interface SettingsActions { setSetting: <K extends keyof AppSettings>(key: K, value: AppSettings[K]) => void; setHotkeyBinding: (key: HotkeyKey, binding: string) => void; @@ -84,6 +139,13 @@ const DEFAULT_SETTINGS: AppSettings = { defaultWhisperQuantization: DEFAULT_WHISPER_QUANTIZATION, defaultWhisperLanguage: DEFAULT_WHISPER_LANGUAGE, + // AI captioning defaults + captioningIntervalUnit: 'seconds', + captioningIntervalValue: DEFAULT_CAPTIONING_INTERVAL_SECONDS, + + // Scene Browser defaults + captionSearchMode: 'keyword', + // Keyboard shortcuts hotkeyOverrides: {}, }; @@ -101,7 +163,24 @@ export const useSettingsStore = create<AppSettings & SettingsActions>()( (set) => ({ ...DEFAULT_SETTINGS, - setSetting: (key, value) => set({ [key]: value }), + setSetting: (key, value) => set((state) => { + if (key === 'defaultWhisperModel') { + return { [key]: normalizeSelectableWhisperModel(value as MediaTranscriptModel) }; + } + if (key === 'captioningIntervalUnit') { + const unit = normalizeCaptioningIntervalUnit(value); + return { + captioningIntervalUnit: unit, + captioningIntervalValue: clampCaptioningIntervalValue(state.captioningIntervalValue, unit), + }; + } + if (key === 'captioningIntervalValue') { + return { + captioningIntervalValue: clampCaptioningIntervalValue(value, state.captioningIntervalUnit), + }; + } + return { [key]: value }; + }), setHotkeyBinding: (key, binding) => set((state) => { const normalizedBinding = normalizeHotkeyBinding(binding); @@ -161,11 +240,19 @@ export const useSettingsStore = create<AppSettings & SettingsActions>()( name: 'freecut-settings', merge: (persistedState, currentState) => { const typedState = (persistedState as Partial<AppSettings> | undefined) ??
{}; + const captioningIntervalUnit = normalizeCaptioningIntervalUnit(typedState.captioningIntervalUnit); return { ...currentState, ...typedState, + defaultWhisperModel: normalizeSelectableWhisperModel(typedState.defaultWhisperModel), hotkeyOverrides: sanitizeHotkeyOverrides(typedState.hotkeyOverrides), + captioningIntervalUnit, + captioningIntervalValue: clampCaptioningIntervalValue( + typedState.captioningIntervalValue, + captioningIntervalUnit, + ), + captionSearchMode: normalizeCaptionSearchMode(typedState.captionSearchMode), }; }, } diff --git a/src/features/timeline/components/timeline-content.test.tsx b/src/features/timeline/components/timeline-content.test.tsx index 0986cfbad..c4d479cc6 100644 --- a/src/features/timeline/components/timeline-content.test.tsx +++ b/src/features/timeline/components/timeline-content.test.tsx @@ -1,5 +1,5 @@ import { createRef, type ReactNode } from 'react'; -import { act, render, waitFor } from '@testing-library/react'; +import { act, fireEvent, render, waitFor } from '@testing-library/react'; import { beforeAll, beforeEach, describe, expect, it, vi } from 'vitest'; import { useEditorStore } from '@/app/state/editor'; @@ -131,6 +131,7 @@ beforeAll(() => { function resetStores() { useEditorStore.setState({ linkedSelectionEnabled: true, + transcriptionDialogDepth: 0, }); useSelectionStore.setState({ @@ -216,6 +217,39 @@ describe('TimelineContent playback selection behavior', () => { }); }); + it('does not update the hover scrub preview while the transcription dialog is open', async () => { + const { container } = render(); + const scrollContainer = container.querySelector('[data-timeline-scroll-container]'); + + if (!(scrollContainer instanceof HTMLDivElement)) { + throw new Error('Expected timeline scroll container'); + } + + Object.defineProperty(scrollContainer, 'getBoundingClientRect', { + configurable: true, + value: () => ({ + left: 0, + top: 0, + right: 400, + bottom: 200, + width: 400, + height: 200, + x: 0, + y: 0, + toJSON: () => ({}), + }), + }); + + act(() => { + useEditorStore.setState({ transcriptionDialogDepth: 1 }); + usePlaybackStore.getState().setPreviewFrame(12); + }); + + fireEvent.mouseMove(scrollContainer, { clientX: 180, clientY: 48 }); + + expect(usePlaybackStore.getState().previewFrame).toBeNull(); + }); + it('reveals the active track when selection moves to an offscreen lane', async () => { const videoTracks: TimelineTrack[] = [ { ...VIDEO_TRACK, id: 'track-video-1', name: 'V1', order: 0 }, @@ -375,4 +409,19 @@ describe('TimelineContent playback selection behavior', () => { }); expect(audioScrollContainer!.scrollTop).toBe(55); }); + + it('does not clear previewFrame on ruler mousedown before the ruler handler runs', () => { + const { container } = render(); + + act(() => { + usePlaybackStore.getState().setPreviewFrame(24); + }); + + const ruler = container.querySelector('.timeline-ruler') as HTMLDivElement | null; + expect(ruler).toBeTruthy(); + + fireEvent.mouseDown(ruler!, { button: 0 }); + + expect(usePlaybackStore.getState().previewFrame).toBe(24); + }); }); diff --git a/src/features/timeline/components/timeline-content.tsx b/src/features/timeline/components/timeline-content.tsx index 432bb696c..3deca3c26 100644 --- a/src/features/timeline/components/timeline-content.tsx +++ b/src/features/timeline/components/timeline-content.tsx @@ -51,6 +51,12 @@ import { useTransitionsStore } from '../stores/transitions-store'; import { getFilteredItemSnapEdges } from '../utils/timeline-snap-utils'; import { expandSelectionWithLinkedItems 
} from '../utils/linked-items'; import { getTimelineWidth, getZoomToFitLevel } from '../utils/timeline-layout'; +import { + getAnchoredZoomScrollLeft, + getCursorZoomAnchor, + getPlayheadZoomAnchor, + type TimelineZoomAnchor, +} from '../utils/zoom-anchor'; const ACTIVE_TIMELINE_GESTURE_CURSOR_CLASSES = [ 'timeline-cursor-trim-left', @@ -720,6 +726,7 @@ export const TimelineContent = memo(function TimelineContent({ const selectMarker = useSelectionStore((s) => s.selectMarker); const clearItemSelection = useSelectionStore((s) => s.clearItemSelection); const activeTrackId = useSelectionStore((s) => s.activeTrackId); + const isTranscriptionDialogOpen = useEditorStore((s) => s.transcriptionDialogDepth > 0); // Granular selectors for drag state - avoid subscribing to entire dragState object const isDragging = useSelectionStore((s) => !!s.dragState?.isDragging); const containerRef = useRef(null); @@ -758,6 +765,13 @@ export const TimelineContent = memo(function TimelineContent({ } }, [isDragging]); + useEffect(() => { + if (!isTranscriptionDialogOpen) return; + if (usePlaybackStore.getState().previewFrame !== null) { + usePlaybackStore.getState().setPreviewFrame(null); + } + }, [isTranscriptionDialogOpen]); + // Cleanup preview RAF on unmount useEffect(() => { return () => { @@ -1125,12 +1139,23 @@ export const TimelineContent = memo(function TimelineContent({ // Preview scrubber: show ghost playhead on hover const handleTimelineMouseDownCapture = useCallback((e: React.MouseEvent) => { if (e.button !== 0) return; + const target = e.target as HTMLElement; + if (target.closest('.timeline-ruler') || target.closest('[data-playhead-handle]')) { + return; + } if (usePlaybackStore.getState().previewFrame !== null) { setPreviewFrameRef.current(null); } }, []); const handleTimelineMouseMove = useCallback((e: React.MouseEvent) => { + if (useEditorStore.getState().transcriptionDialogDepth > 0) { + if (usePlaybackStore.getState().previewFrame !== null) { + setPreviewFrameRef.current(null); + } + return; + } + // Skip during playback if (usePlaybackStore.getState().isPlaying) { if (usePlaybackStore.getState().previewFrame !== null) { @@ -1235,15 +1260,27 @@ export const TimelineContent = memo(function TimelineContent({ actualDurationRef.current = actualDuration; + useLayoutEffect(() => { + const container = containerRef.current; + if (!container) { + return; + } + + const maxScrollLeft = Math.max(0, timelineWidth - container.clientWidth); + if (container.scrollLeft <= maxScrollLeft + 1) { + return; + } + + // Clamp stale scroll after timeline shrink so ruler and tracks stay aligned + // without subscribing broad UI surfaces to item-array churn. 
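+ // (Illustrative numbers: a 4000px-wide timeline in a 1200px viewport allows scrollLeft up to 2800; shrink the timeline to 2000px and the max drops to 800, so a stale 2800 snaps back.)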
+ container.scrollLeft = maxScrollLeft; + scrollLeftRef.current = maxScrollLeft; + syncViewportFromContainer(); + }, [timelineWidth, syncViewportFromContainer]); + // NOTE: itemsByTrack removed - TimelineTrack now fetches its own items // This prevents cascade re-renders when only one track's items change - /** - * Adjusts scroll position to keep cursor position stable when zoom changes - * (Anchor zooming - cursor stays visually fixed, content scales around it) - * - * Uses refs for dynamic values to avoid callback recreation on every render - */ const scheduleZoomApply = useCallback((nextZoomLevel: number, nextScrollLeft: number) => { queuedZoomLevelRef.current = nextZoomLevel; queuedZoomScrollLeftRef.current = nextScrollLeft; @@ -1278,64 +1315,64 @@ export const TimelineContent = memo(function TimelineContent({ } }, []); - const applyZoomWithPlayheadCentering = useCallback((newZoomLevel: number) => { - const container = containerRef.current; - if (!container) return; - + const applyZoomWithAnchor = useCallback((newZoomLevel: number, anchor: TimelineZoomAnchor) => { const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; - - // Clamp zoom to valid range const clampedZoom = Math.max(0.01, Math.min(2, newZoomLevel)); if (clampedZoom === currentZoom) return; - // Cursor's screen position (relative to container's visible left edge) - const cursorScreenX = zoomCursorXRef.current; + const nextScrollLeft = getAnchoredZoomScrollLeft({ + anchor, + maxDurationSeconds: actualDurationRef.current, + nextZoomLevel: clampedZoom, + }); - // Calculate cursor's position in CONTENT coordinates (timeline space) - const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; - const cursorContentX = baseScrollLeft + cursorScreenX; + scheduleZoomApply(clampedZoom, nextScrollLeft); + }, [scheduleZoomApply]); - // Convert to time using current zoom, clamped to actual content duration - const currentPixelsPerSecond = currentZoom * 100; - const cursorTime = Math.min( - cursorContentX / currentPixelsPerSecond, - actualDurationRef.current - ); + const applyZoomWithCursorAnchor = useCallback((newZoomLevel: number) => { + const container = containerRef.current; + if (!container) return; - // Calculate where that same time point will be at the new zoom - const newPixelsPerSecond = clampedZoom * 100; - const newCursorContentX = cursorTime * newPixelsPerSecond; + const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; + const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; - // Calculate scroll needed to keep cursor at same screen position - // cursor should stay at cursorScreenX, so: - // newScrollLeft + cursorScreenX = newCursorContentX - // newScrollLeft = newCursorContentX - cursorScreenX - const newScrollLeft = newCursorContentX - cursorScreenX; + applyZoomWithAnchor(newZoomLevel, getCursorZoomAnchor({ + currentZoomLevel: currentZoom, + cursorScreenX: zoomCursorXRef.current, + maxDurationSeconds: actualDurationRef.current, + scrollLeft: baseScrollLeft, + })); + }, [applyZoomWithAnchor]); - // Only clamp to prevent negative scroll (left boundary) - const clampedScrollLeft = Math.max(0, newScrollLeft); + const applyZoomWithPlayheadAnchor = useCallback((newZoomLevel: number) => { + const container = containerRef.current; + if (!container) return; - // Coalesce dense wheel updates into a single visual zoom publish per frame. 
- scheduleZoomApply(clampedZoom, clampedScrollLeft); - }, [scheduleZoomApply]); + const currentZoom = queuedZoomLevelRef.current ?? zoomLevelRef.current; + const baseScrollLeft = queuedZoomScrollLeftRef.current ?? pendingScrollRef.current ?? container.scrollLeft; + + applyZoomWithAnchor(newZoomLevel, getPlayheadZoomAnchor({ + currentFrame: currentFrameRef.current, + currentZoomLevel: currentZoom, + fps: useTimelineStore.getState().fps, + maxDurationSeconds: actualDurationRef.current, + scrollLeft: baseScrollLeft, + })); + }, [applyZoomWithAnchor]); - // Create zoom handlers that include playhead centering - // These callbacks are stable and don't recreate on every render thanks to refs const handleZoomChange = useCallback((newZoom: number) => { - applyZoomWithPlayheadCentering(newZoom); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoom); + }, [applyZoomWithPlayheadAnchor]); const handleZoomIn = useCallback(() => { - // Use standard zoom step (0.1), read from ref to avoid callback recreation const newZoomLevel = Math.min(2, zoomLevelRef.current + 0.1); - applyZoomWithPlayheadCentering(newZoomLevel); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoomLevel); + }, [applyZoomWithPlayheadAnchor]); const handleZoomOut = useCallback(() => { - // Use standard zoom step (0.1), read from ref to avoid callback recreation const newZoomLevel = Math.max(0.01, zoomLevelRef.current - 0.1); - applyZoomWithPlayheadCentering(newZoomLevel); - }, [applyZoomWithPlayheadCentering]); + applyZoomWithPlayheadAnchor(newZoomLevel); + }, [applyZoomWithPlayheadAnchor]); // Keep a ref to containerWidth for use in stable callbacks const containerWidthRef = useRef(containerWidth); @@ -1460,7 +1497,7 @@ export const TimelineContent = memo(function TimelineContent({ const logZoom = Math.log(currentZoom); const newLogZoom = logZoom - velocityZoomRef.current * 1.2; // Scale factor for feel const newZoomLevel = Math.exp(newLogZoom); - applyZoomWithPlayheadCentering(newZoomLevel); + applyZoomWithCursorAnchor(newZoomLevel); lastZoomApplyTimeRef.current = now; } @@ -1479,7 +1516,7 @@ export const TimelineContent = memo(function TimelineContent({ }; momentumIdRef.current = requestAnimationFrame(momentumLoop); - }, [applyZoomWithPlayheadCentering]); + }, [applyZoomWithCursorAnchor]); // Cleanup momentum on unmount useEffect(() => { @@ -1540,7 +1577,7 @@ export const TimelineContent = memo(function TimelineContent({ newZoom = Math.min(MAX_ZOOM, currentZoom * ZOOM_FACTOR); } - applyZoomWithPlayheadCentering(newZoom); + applyZoomWithCursorAnchor(newZoom); return; } @@ -1589,7 +1626,7 @@ export const TimelineContent = memo(function TimelineContent({ return () => { container.removeEventListener('wheel', wheelHandler); }; - }, [applyZoomWithPlayheadCentering, getVerticalScrollTarget, hasTrackSections, startMomentumScroll]); + }, [applyZoomWithCursorAnchor, getVerticalScrollTarget, hasTrackSections, startMomentumScroll]); const singleSectionTracks = videoTracks.length > 0 ? videoTracks : audioTracks; const singleSectionKind = videoTracks.length > 0 ? 'video' : 'audio'; @@ -1715,4 +1752,4 @@ export const TimelineContent = memo(function TimelineContent({
); }); - \ No newline at end of file + diff --git a/src/features/timeline/components/timeline-item/index.tsx b/src/features/timeline/components/timeline-item/index.tsx index 3712bb059..262c82344 100644 --- a/src/features/timeline/components/timeline-item/index.tsx +++ b/src/features/timeline/components/timeline-item/index.tsx @@ -21,8 +21,17 @@ import { useTransitionDragStore, } from '@/shared/state/transition-drag'; import { useMediaLibraryStore } from '@/features/timeline/deps/media-library-store'; +import { mediaTranscriptionService } from '@/features/timeline/deps/media-transcription-service'; +import { TranscribeDialog, type TranscribeDialogValues } from '@/features/timeline/deps/transcribe-dialog'; +import { + getTranscriptionOverallPercent, + getTranscriptionStageLabel, +} from '@/shared/utils/transcription-progress'; +import { + isTranscriptionOutOfMemoryError, + TRANSCRIPTION_OOM_HINT, +} from '@/shared/utils/transcription-cancellation'; import type { PreviewItemUpdate } from '../../utils/item-edit-preview'; -import { useSettingsStore } from '@/features/timeline/deps/settings'; import { useTimelineDrag, dragOffsetRef, dragPreviewOffsetByItemRef } from '../../hooks/use-timeline-drag'; import { useTimelineTrim } from '../../hooks/use-timeline-trim'; import { useTrackPush } from '../../hooks/use-track-push'; @@ -208,6 +217,38 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [item.mediaId] ) ); + const transcriptProgress = useMediaLibraryStore( + useCallback( + (s) => (item.mediaId ? s.transcriptProgress.get(item.mediaId) ?? null : null), + [item.mediaId] + ) + ); + const mediaFileName = useMediaLibraryStore( + useCallback( + (s) => (item.mediaId + ? s.mediaItems.find((m) => m.id === item.mediaId)?.fileName ?? '' + : ''), + [item.mediaId] + ) + ); + const [captionDialogOpen, setCaptionDialogOpen] = useState(false); + const [captionDialogError, setCaptionDialogError] = useState(null); + const mediaHasTranscript = transcriptStatus === 'ready'; + const captionStartedRef = useRef(false); + const captionStopRequestedRef = useRef(false); + + const captionIsActive = + transcriptStatus === 'queued' || transcriptStatus === 'transcribing'; + useEffect(() => { + if (captionStartedRef.current && !captionIsActive) { + captionStartedRef.current = false; + const keepOpen = captionStopRequestedRef.current || captionDialogError !== null; + captionStopRequestedRef.current = false; + setCaptionDialogOpen((wasOpen) => { + return wasOpen && keepOpen; + }); + } + }, [captionIsActive, captionDialogError]); // O(1) index lookup that preserves both explicit captionSource links and // legacy generated-caption detection. const hasGeneratedCaptions = useItemsStore( @@ -216,11 +257,13 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [item.id] ) ); - const defaultWhisperModel = useSettingsStore((s) => s.defaultWhisperModel); // O(1) via index, including legacy linked audio/video pairs. const isLinked = useItemsStore( useCallback((s) => !!s.linkedItemsByItemId[item.id], [item.id]) ); + const linkedItemsForCaptionOwnership = useItemsStore( + useCallback((s) => s.linkedItemsByItemId[item.id] ?? EMPTY_LINKED_ITEMS, [item.id]) + ); const linkedSelectionEnabled = useEditorStore((s) => s.linkedSelectionEnabled); const segmentOverlays = useTimelineItemOverlayStore( useCallback((s) => s.overlaysByItemId[item.id] ?? 
EMPTY_SEGMENT_OVERLAYS, [item.id]) @@ -241,6 +284,20 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration [itemKeyframes] ); const hasKeyframes = keyframedProperties.length > 0; + const linkedVideoCaptionOwner = useMemo(() => { + if (item.type !== 'audio' || !item.mediaId) { + return null; + } + + return linkedItemsForCaptionOwnership.find((linkedItem) => ( + linkedItem.id !== item.id + && linkedItem.type === 'video' + && linkedItem.mediaId === item.mediaId + )) ?? null; + }, [item.id, item.mediaId, item.type, linkedItemsForCaptionOwnership]); + const canManageCaptions = !!item.mediaId + && !isBroken + && (item.type === 'video' || (item.type === 'audio' && linkedVideoCaptionOwner === null)); // Use refs for actions to avoid selector re-renders - read from store in callbacks const activeTool = useSelectionStore((s) => s.activeTool); @@ -1403,7 +1460,6 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration getCanLinkSelected, getCanUnlinkSelected, hasSpeakableText, - isCaptionGenerationActive, isSceneDetectionActive, isCompositionItem, handleJoinSelected, @@ -1418,8 +1474,8 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration handleBentoLayout, handleFreezeFrame, handleGenerateAudioFromText, - handleGenerateCaptions, - handleRegenerateCaptions, + handleCaptionsFromDialog, + handleApplyCaptionsFromTranscript, handleCreatePreComp, handleEnterComposition, handleDissolveComposition, @@ -2480,12 +2536,19 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration onFreezeFrame={handleFreezeFrame} isTextItem={item.type === 'text' && hasSpeakableText} onGenerateAudioFromText={handleGenerateAudioFromText} - canGenerateCaptions={(item.type === 'video' || item.type === 'audio') && !!item.mediaId && !isBroken} - canRegenerateCaptions={hasGeneratedCaptions} - isGeneratingCaptions={isCaptionGenerationActive || transcriptStatus === 'transcribing'} - defaultCaptionModel={defaultWhisperModel} - onGenerateCaptions={handleGenerateCaptions} - onRegenerateCaptions={handleRegenerateCaptions} + canManageCaptions={canManageCaptions} + hasCaptions={hasGeneratedCaptions} + hasTranscript={mediaHasTranscript} + isGeneratingCaptions={ + transcriptStatus === 'queued' + || transcriptStatus === 'transcribing' + } + onOpenCaptionDialog={() => { + captionStopRequestedRef.current = false; + setCaptionDialogError(null); + setCaptionDialogOpen(true); + }} + onApplyCaptionsFromTranscript={handleApplyCaptionsFromTranscript} isCompositionItem={isCompositionItem} onEnterComposition={handleEnterComposition} onDissolveComposition={handleDissolveComposition} @@ -2864,6 +2927,56 @@ export const TimelineItem = memo(function TimelineItem({ item, timelineDuration /> + {canManageCaptions && item.mediaId && ( + { + if (!next) setCaptionDialogError(null); + setCaptionDialogOpen(next); + }} + fileName={mediaFileName} + hasTranscript={mediaHasTranscript} + isRunning={ + transcriptStatus === 'queued' + || transcriptStatus === 'transcribing' + } + progressPercent={ + transcriptProgress + ? Math.round(getTranscriptionOverallPercent(transcriptProgress)) + : null + } + progressLabel={ + transcriptProgress + ? `${getTranscriptionStageLabel(transcriptProgress.stage)} (${Math.round( + getTranscriptionOverallPercent(transcriptProgress), + )}%)` + : 'Transcribing...' 
+ } + errorMessage={captionDialogError} + onStart={(values: TranscribeDialogValues) => { + captionStartedRef.current = true; + captionStopRequestedRef.current = false; + setCaptionDialogError(null); + handleCaptionsFromDialog(values, hasGeneratedCaptions, (error) => { + captionStartedRef.current = false; + const baseMessage = error instanceof Error + ? error.message + : 'Failed to generate captions'; + setCaptionDialogError( + isTranscriptionOutOfMemoryError(error) + ? TRANSCRIPTION_OOM_HINT + : baseMessage, + ); + }); + }} + onCancel={() => { + if (item.mediaId) { + captionStopRequestedRef.current = true; + mediaTranscriptionService.cancelTranscription(item.mediaId); + } + }} + /> + )} ); }, (prevProps, nextProps) => { diff --git a/src/features/timeline/components/timeline-item/item-context-menu.test.tsx b/src/features/timeline/components/timeline-item/item-context-menu.test.tsx index 43996f5fb..26f940d9b 100644 --- a/src/features/timeline/components/timeline-item/item-context-menu.test.tsx +++ b/src/features/timeline/components/timeline-item/item-context-menu.test.tsx @@ -39,11 +39,6 @@ vi.mock('@/features/timeline/deps/analysis', () => ({ getSceneVerificationModelOptions: mockGetSceneVerificationModelOptions, })); -vi.mock('@/features/timeline/deps/media-transcription-service', () => ({ - getMediaTranscriptionModelLabel: (model: string) => model, - getMediaTranscriptionModelOptions: () => [], -})); - vi.mock('@/features/timeline/deps/settings', () => ({ useResolvedHotkeys: () => ({}), })); @@ -114,3 +109,54 @@ describe('ItemContextMenu scene detection', () => { expect(onDetectScenes).toHaveBeenCalledWith('optical-flow', 'lfm'); }); }); + +describe('ItemContextMenu captions', () => { + it('shows a single "Generate Captions" item when no transcript exists', () => { + const onOpenCaptionDialog = vi.fn(); + + renderContextMenu({ + canManageCaptions: true, + hasCaptions: false, + hasTranscript: false, + onOpenCaptionDialog, + }); + + const item = screen.getByRole('button', { name: 'Generate Captions' }); + expect(item).toBeInTheDocument(); + expect(screen.queryByText('Captions')).not.toBeInTheDocument(); + fireEvent.click(item); + expect(onOpenCaptionDialog).toHaveBeenCalledTimes(1); + }); + + it('shows a Captions submenu with Insert + Generate when a transcript already exists', () => { + const onOpenCaptionDialog = vi.fn(); + const onApplyCaptionsFromTranscript = vi.fn(); + + renderContextMenu({ + canManageCaptions: true, + hasCaptions: false, + hasTranscript: true, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, + }); + + expect(screen.getByText('Captions')).toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Insert Existing Captions' })).toBeInTheDocument(); + expect(screen.getByRole('button', { name: 'Generate Captions' })).toBeInTheDocument(); + + fireEvent.click(screen.getByRole('button', { name: 'Insert Existing Captions' })); + expect(onApplyCaptionsFromTranscript).toHaveBeenCalledTimes(1); + }); + + it('labels the generate item "Regenerate Captions" when the clip already has captions', () => { + renderContextMenu({ + canManageCaptions: true, + hasCaptions: true, + hasTranscript: true, + onOpenCaptionDialog: vi.fn(), + onApplyCaptionsFromTranscript: vi.fn(), + }); + + expect(screen.getByRole('button', { name: 'Regenerate Captions' })).toBeInTheDocument(); + }); +}); diff --git a/src/features/timeline/components/timeline-item/item-context-menu.tsx b/src/features/timeline/components/timeline-item/item-context-menu.tsx index de117ccb4..f4e27d58c 100644 --- 
a/src/features/timeline/components/timeline-item/item-context-menu.tsx +++ b/src/features/timeline/components/timeline-item/item-context-menu.tsx @@ -18,11 +18,6 @@ import { import { useSelectionStore } from '@/shared/state/selection'; import { PROPERTY_LABELS, type AnimatableProperty } from '@/types/keyframe'; import type { PropertyKeyframes } from '@/types/keyframe'; -import type { MediaTranscriptModel } from '@/types/storage'; -import { - getMediaTranscriptionModelLabel, - getMediaTranscriptionModelOptions, -} from '@/features/timeline/deps/media-transcription-service'; import { getSceneVerificationModelOptions, type VerificationModel, @@ -58,12 +53,12 @@ interface ItemContextMenuProps { /** Whether the playhead is within this item's bounds */ playheadInBounds?: boolean; onFreezeFrame?: () => void; - canGenerateCaptions?: boolean; - canRegenerateCaptions?: boolean; + canManageCaptions?: boolean; + hasCaptions?: boolean; + hasTranscript?: boolean; isGeneratingCaptions?: boolean; - defaultCaptionModel?: MediaTranscriptModel; - onGenerateCaptions?: (model: MediaTranscriptModel) => void; - onRegenerateCaptions?: (model: MediaTranscriptModel) => void; + onOpenCaptionDialog?: () => void; + onApplyCaptionsFromTranscript?: () => void; /** Whether this item is a composition item (enables enter/dissolve options) */ isCompositionItem?: boolean; onEnterComposition?: () => void; @@ -113,12 +108,12 @@ export const ItemContextMenu = memo(function ItemContextMenu({ isVideoItem, playheadInBounds, onFreezeFrame, - canGenerateCaptions, - canRegenerateCaptions, + canManageCaptions, + hasCaptions, + hasTranscript, isGeneratingCaptions, - defaultCaptionModel, - onGenerateCaptions, - onRegenerateCaptions, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, isCompositionItem, onEnterComposition, onDissolveComposition, @@ -175,12 +170,12 @@ export const ItemContextMenu = memo(function ItemContextMenu({ isVideoItem={isVideoItem} playheadInBounds={playheadInBounds} onFreezeFrame={onFreezeFrame} - canGenerateCaptions={canGenerateCaptions} - canRegenerateCaptions={canRegenerateCaptions} + canManageCaptions={canManageCaptions} + hasCaptions={hasCaptions} + hasTranscript={hasTranscript} isGeneratingCaptions={isGeneratingCaptions} - defaultCaptionModel={defaultCaptionModel} - onGenerateCaptions={onGenerateCaptions} - onRegenerateCaptions={onRegenerateCaptions} + onOpenCaptionDialog={onOpenCaptionDialog} + onApplyCaptionsFromTranscript={onApplyCaptionsFromTranscript} isCompositionItem={isCompositionItem} onEnterComposition={onEnterComposition} onDissolveComposition={onDissolveComposition} @@ -255,12 +250,12 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ isVideoItem, playheadInBounds, onFreezeFrame, - canGenerateCaptions, - canRegenerateCaptions, + canManageCaptions, + hasCaptions, + hasTranscript, isGeneratingCaptions, - defaultCaptionModel, - onGenerateCaptions, - onRegenerateCaptions, + onOpenCaptionDialog, + onApplyCaptionsFromTranscript, isCompositionItem, onEnterComposition, onDissolveComposition, @@ -286,18 +281,11 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ if (!keyframedProperties) return []; return keyframedProperties.filter(p => p.keyframes.length > 0); }, [keyframedProperties]); - const transcriptionModelOptions = useMemo( - () => getMediaTranscriptionModelOptions(), - [], - ); - const explicitCaptionModelOptions = useMemo( - () => transcriptionModelOptions.filter((option) => option.value !== defaultCaptionModel), - [defaultCaptionModel, 
transcriptionModelOptions], - ); const sceneVerificationModelOptions = useMemo( () => getSceneVerificationModelOptions(), [], ); + const captionActionLabel = hasCaptions ? 'Regenerate Captions' : 'Generate Captions'; const hasKeyframes = propertiesWithKeyframes.length > 0; @@ -454,60 +442,28 @@ const ItemContextMenuFull = memo(function ItemContextMenuFull({ )} - {canGenerateCaptions && onGenerateCaptions && ( + {canManageCaptions && onOpenCaptionDialog && ( <> {isGeneratingCaptions ? ( - Updating Captions... + Updating captions... + ) : hasTranscript && onApplyCaptionsFromTranscript ? ( + + Captions + + + Insert Existing Captions + + + {captionActionLabel} + + + ) : ( - <> - - Generate Captions for Segment - - {defaultCaptionModel && ( - <> - onGenerateCaptions(defaultCaptionModel)}> - {`Default (${getMediaTranscriptionModelLabel(defaultCaptionModel)})`} - - - - )} - {explicitCaptionModelOptions.map((option) => ( - onGenerateCaptions(option.value)} - > - {option.label} - - ))} - - - - {canRegenerateCaptions && onRegenerateCaptions && ( - - Regenerate Captions for Segment - - {defaultCaptionModel && ( - <> - onRegenerateCaptions(defaultCaptionModel)}> - {`Default (${getMediaTranscriptionModelLabel(defaultCaptionModel)})`} - - - - )} - {explicitCaptionModelOptions.map((option) => ( - onRegenerateCaptions(option.value)} - > - {option.label} - - ))} - - - )} - + + {captionActionLabel} + )} diff --git a/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts b/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts index b1372a1bc..0279752b1 100644 --- a/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts +++ b/src/features/timeline/components/timeline-item/use-timeline-item-actions.ts @@ -2,13 +2,20 @@ import { useCallback, useEffect, useRef } from 'react'; import { toast } from 'sonner'; import type { TimelineItem as TimelineItemType } from '@/types/timeline'; import type { AnimatableProperty } from '@/types/keyframe'; -import type { MediaTranscriptModel } from '@/types/storage'; +import type { + MediaTranscriptModel, + MediaTranscriptQuantization, +} from '@/types/storage'; import { useSelectionStore } from '@/shared/state/selection'; import { usePlaybackStore } from '@/shared/state/playback'; import { useClearKeyframesDialogStore } from '@/app/state/clear-keyframes-dialog'; import { useTtsGenerateDialogStore } from '@/app/state/tts-generate-dialog'; -import { isLocalInferenceCancellationError } from '@/shared/state/local-inference'; -import { getTranscriptionOverallPercent } from '@/shared/utils/transcription-progress'; +import { scheduleAfterPaint } from '@/shared/utils/schedule-after-paint'; +import { + isTranscriptionCancellationError, + isTranscriptionOutOfMemoryError, + TRANSCRIPTION_OOM_HINT, +} from '@/shared/utils/transcription-cancellation'; import { useMediaLibraryStore } from '@/features/timeline/deps/media-library-store'; import { getMediaTranscriptionModelLabel, @@ -40,8 +47,11 @@ import { } from '../../deps/analysis'; import { resolveMediaUrl } from '../../deps/media-library-resolver'; import { useBentoLayoutDialogStore } from '../bento-layout-dialog-store'; +import { createLogger } from '@/shared/logging/logger'; +import { saveScenes } from '@/infrastructure/storage/workspace-fs/scenes'; + +const logger = createLogger('UseTimelineItemActions'); -const CAPTION_GENERATION_OVERLAY_ID = 'caption-generation'; const SCENE_DETECTION_OVERLAY_ID = 'scene-detection'; interface UseTimelineItemActionsParams { @@ -180,6 
+190,9 @@ export function useTimelineItemActions({ options?: { forceTranscription?: boolean; replaceExisting?: boolean; + quantization?: MediaTranscriptQuantization; + language?: string; + onError?: (error: unknown) => void; }, ) => { if ((item.type !== 'video' && item.type !== 'audio') || !item.mediaId || isBroken) { @@ -189,11 +202,9 @@ export function useTimelineItemActions({ const mediaId = item.mediaId; const clipId = item.id; const store = useMediaLibraryStore.getState(); - const overlayStore = useTimelineItemOverlayStore.getState(); const previousStatus = store.transcriptStatus.get(mediaId) ?? 'idle'; const forceTranscription = options?.forceTranscription ?? false; const replaceExisting = options?.replaceExisting ?? false; - const overlayLabel = forceTranscription ? 'Regenerating captions' : 'Generating captions'; const run = async () => { let updatedTranscriptStatus = previousStatus; @@ -204,28 +215,26 @@ export function useTimelineItemActions({ forceTranscription || !existingTranscript || existingTranscript.model !== model; if (needsTranscription) { - overlayStore.upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: overlayLabel, - progress: 0, - tone: 'info', - }); - store.setTranscriptStatus(mediaId, 'transcribing'); - store.setTranscriptProgress(mediaId, { stage: 'loading', progress: 0 }); + store.setTranscriptStatus(mediaId, 'queued'); + store.setTranscriptProgress(mediaId, { stage: 'queued', progress: 0 }); await mediaTranscriptionService.transcribeMedia(mediaId, { model, + quantization: options?.quantization, + language: options?.language || undefined, + onQueueStatusChange: (state) => { + if (state === 'queued') { + store.setTranscriptStatus(mediaId, 'queued'); + store.setTranscriptProgress(mediaId, { stage: 'queued', progress: 0 }); + return; + } + + store.setTranscriptStatus(mediaId, 'transcribing'); + store.setTranscriptProgress(mediaId, { stage: 'loading', progress: 0 }); + }, onProgress: (progress) => { const mediaLibraryStore = useMediaLibraryStore.getState(); mediaLibraryStore.setTranscriptProgress(mediaId, progress); - const mergedProgress = mediaLibraryStore.transcriptProgress.get(mediaId) ?? progress; - - useTimelineItemOverlayStore.getState().upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: overlayLabel, - progress: getTranscriptionOverallPercent(mergedProgress), - tone: 'info', - }); }, }); @@ -233,16 +242,10 @@ export function useTimelineItemActions({ store.setTranscriptStatus(mediaId, updatedTranscriptStatus); store.clearTranscriptProgress(mediaId); } else { - overlayStore.upsertOverlay(clipId, { - id: CAPTION_GENERATION_OVERLAY_ID, - label: replaceExisting ? 'Replacing captions' : 'Adding captions', - tone: 'info', - }); updatedTranscriptStatus = 'ready'; store.setTranscriptStatus(mediaId, updatedTranscriptStatus); store.clearTranscriptProgress(mediaId); } - const result = await mediaTranscriptionService.insertTranscriptAsCaptions(mediaId, { clipIds: [clipId], replaceExisting, @@ -251,17 +254,17 @@ export function useTimelineItemActions({ const successMessage = replaceExisting ? result.insertedItemCount > 0 ? result.removedItemCount > 0 - ? `Replaced ${result.removedItemCount} caption clip${result.removedItemCount === 1 ? '' : 's'} with ${result.insertedItemCount} updated clip${result.insertedItemCount === 1 ? '' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Regenerated ${result.insertedItemCount} caption clip${result.insertedItemCount === 1 ? 
'' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Removed ${result.removedItemCount} generated caption clip${result.removedItemCount === 1 ? '' : 's'} for this segment using ${getMediaTranscriptionModelLabel(model)}` - : `Inserted ${result.insertedItemCount} caption clip${result.insertedItemCount === 1 ? '' : 's'} for this segment with ${getMediaTranscriptionModelLabel(model)}`; + ? `Updated captions on this segment with ${getMediaTranscriptionModelLabel(model)}` + : `Refreshed captions on this segment with ${getMediaTranscriptionModelLabel(model)}` + : `Removed captions from this segment` + : `Added captions to this segment with ${getMediaTranscriptionModelLabel(model)}`; store.showNotification({ type: 'success', message: successMessage, }); } catch (error) { - if (isLocalInferenceCancellationError(error)) { + if (isTranscriptionCancellationError(error)) { store.setTranscriptStatus(mediaId, previousStatus); store.clearTranscriptProgress(mediaId); return; @@ -269,32 +272,81 @@ export function useTimelineItemActions({ store.setTranscriptStatus(mediaId, updatedTranscriptStatus === 'ready' ? 'ready' : 'error'); store.clearTranscriptProgress(mediaId); + const fallbackMessage = error instanceof Error + ? error.message + : 'Failed to generate captions for segment'; + const friendlyMessage = isTranscriptionOutOfMemoryError(error) + ? TRANSCRIPTION_OOM_HINT + : fallbackMessage; + options?.onError?.(error); store.showNotification({ type: 'error', - message: error instanceof Error ? error.message : 'Failed to generate captions for segment', + message: friendlyMessage, }); - } finally { - useTimelineItemOverlayStore.getState().removeOverlay(clipId, CAPTION_GENERATION_OVERLAY_ID); } }; - void run(); + scheduleAfterPaint(() => { + void run(); + }); }, [item.id, item.mediaId, item.type, isBroken]); - const handleGenerateCaptions = useCallback((model: MediaTranscriptModel) => { - handleCaptionGeneration(model); - }, [handleCaptionGeneration]); - - const handleRegenerateCaptions = useCallback((model: MediaTranscriptModel) => { - handleCaptionGeneration(model, { + const handleCaptionsFromDialog = useCallback((values: { + model: MediaTranscriptModel; + quantization: MediaTranscriptQuantization; + language: string; + }, hasExistingCaptions: boolean, onError?: (error: unknown) => void) => { + handleCaptionGeneration(values.model, { + // The dialog path is always "generate fresh captions". Reusing the + // current transcript is handled explicitly by "Insert Existing Captions". 
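+      // e.g. (hypothetical values) a dialog submit of { model: 'whisper-tiny',
+      // quantization: 'q8', language: 'en' } re-runs transcription even when a
+      // transcript from an earlier run is already cached; existing caption
+      // clips are replaced only when the segment had some (hasExistingCaptions).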
forceTranscription: true, - replaceExisting: true, + replaceExisting: hasExistingCaptions, + quantization: values.quantization, + language: values.language, + onError, }); }, [handleCaptionGeneration]); - const isCaptionGenerationActive = segmentOverlays.some( - (overlay) => overlay.id === CAPTION_GENERATION_OVERLAY_ID, - ); + const handleApplyCaptionsFromTranscript = useCallback(() => { + if ((item.type !== 'video' && item.type !== 'audio') || !item.mediaId || isBroken) { + return; + } + + const mediaId = item.mediaId; + const clipId = item.id; + const replaceExisting = useItemsStore.getState().replaceableCaptionClipIds.has(clipId); + const store = useMediaLibraryStore.getState(); + + const run = async () => { + try { + const existingTranscript = await mediaTranscriptionService.getTranscript(mediaId); + if (!existingTranscript) { + throw new Error('Generate a transcript first, then add captions from it.'); + } + + const result = await mediaTranscriptionService.insertTranscriptAsCaptions(mediaId, { + clipIds: [clipId], + replaceExisting, + }); + + store.showNotification({ + type: 'success', + message: replaceExisting + ? result.insertedItemCount > 0 || result.removedItemCount > 0 + ? 'Updated captions on this segment from the current transcript' + : 'Removed captions from this segment' + : 'Added captions to this segment from the current transcript', + }); + } catch (error) { + store.showNotification({ + type: 'error', + message: error instanceof Error ? error.message : 'Failed to update captions for segment', + }); + } + }; + + void run(); + }, [isBroken, item.id, item.mediaId, item.type]); const isSceneDetectionActive = segmentOverlays.some( (overlay) => overlay.id === SCENE_DETECTION_OVERLAY_ID, @@ -414,6 +466,21 @@ export function useTimelineItemActions({ }, }); + // Persist scene cuts to the workspace so the next session/window + // doesn't need to recompute. Fire-and-forget — UX proceeds regardless. + if (cuts.length > 0) { + void saveScenes({ + mediaId, + service: method === 'histogram' ? 'scene-detect-histogram' : 'scene-detect-optical-flow', + model: verificationModel ?? method, + method, + sampleIntervalMs: method === 'histogram' ? 
250 : 500, + verificationModel, + fps: mediaFps, + cuts, + }).catch((error) => logger.warn('Failed to persist scene cuts', error)); + } + if (cuts.length === 0) { toast.info('No scene cuts detected'); return; @@ -470,7 +537,6 @@ export function useTimelineItemActions({ getCanLinkSelected, getCanUnlinkSelected, hasSpeakableText, - isCaptionGenerationActive, isSceneDetectionActive, isCompositionItem, handleJoinSelected, @@ -485,8 +551,8 @@ export function useTimelineItemActions({ handleBentoLayout, handleFreezeFrame, handleGenerateAudioFromText, - handleGenerateCaptions, - handleRegenerateCaptions, + handleCaptionsFromDialog, + handleApplyCaptionsFromTranscript, handleCreatePreComp, handleEnterComposition, handleDissolveComposition, diff --git a/src/features/timeline/components/timeline-navigator.tsx b/src/features/timeline/components/timeline-navigator.tsx index 5e4dbd5df..f5b139d8e 100644 --- a/src/features/timeline/components/timeline-navigator.tsx +++ b/src/features/timeline/components/timeline-navigator.tsx @@ -2,6 +2,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import { useTimelineViewportStore } from '../stores/timeline-viewport-store'; import { useTimelineStore } from '../stores/timeline-store'; +import { useItemsStore } from '../stores/items-store'; import { useZoomStore } from '../stores/zoom-store'; import { cn } from '@/shared/ui/cn'; import { getNavigatorResizeDragResult, getNavigatorThumbMetrics } from './timeline-navigator-utils'; @@ -32,9 +33,7 @@ export function TimelineNavigator({ const [dragStartThumbLeft, setDragStartThumbLeft] = useState(0); const [dragStartThumbWidth, setDragStartThumbWidth] = useState(0); - const maxFrame = useTimelineStore((s) => - s.items.reduce((max, item) => Math.max(max, item.from + item.durationInFrames), 0) - ); + const maxFrame = useItemsStore((s) => s.maxItemEndFrame); const contentDuration = useMemo(() => { const furthestEndSeconds = maxFrame / fps; diff --git a/src/features/timeline/components/timeline-playhead.test.tsx b/src/features/timeline/components/timeline-playhead.test.tsx new file mode 100644 index 000000000..1cd135ec3 --- /dev/null +++ b/src/features/timeline/components/timeline-playhead.test.tsx @@ -0,0 +1,72 @@ +import { fireEvent, render, waitFor } from '@testing-library/react'; +import { beforeEach, describe, expect, it } from 'vitest'; + +import { usePlaybackStore } from '@/shared/state/playback'; +import { TimelinePlayhead } from './timeline-playhead'; +import { useZoomStore, _resetZoomStoreForTest } from '../stores/zoom-store'; +import { useTimelineStore } from '../stores/timeline-store'; + +describe('TimelinePlayhead', () => { + beforeEach(() => { + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 0, + isPlaying: false, + playbackRate: 1, + loop: false, + volume: 1, + muted: false, + zoom: -1, + previewFrame: null, + previewFrameEpoch: 0, + frameUpdateEpoch: 0, + previewItemId: null, + useProxy: true, + previewQuality: 1, + }); + useTimelineStore.setState({ fps: 30 }); + _resetZoomStoreForTest(); + useZoomStore.getState().setZoomLevelSynchronized(1); + }); + + it('uses atomic scrub updates while dragging and clears preview on release', async () => { + const { container } = render( +
+      <div className="timeline-ruler">
+        <TimelinePlayhead inRuler />
+      </div>
, + ); + + const ruler = container.querySelector('.timeline-ruler') as HTMLDivElement | null; + expect(ruler).toBeTruthy(); + + ruler!.getBoundingClientRect = () => ({ + x: 0, + y: 0, + left: 0, + top: 0, + right: 600, + bottom: 40, + width: 600, + height: 40, + toJSON: () => ({}), + }); + + const hitArea = container.querySelector('[style*="width: 20px"]') as HTMLDivElement | null; + expect(hitArea).toBeTruthy(); + + fireEvent.mouseDown(hitArea!, { clientX: 24, clientY: 8, button: 0 }); + fireEvent.mouseMove(document, { clientX: 120, clientY: 8 }); + + await waitFor(() => { + expect(usePlaybackStore.getState().previewFrame).toBe(36); + expect(usePlaybackStore.getState().currentFrame).toBe(36); + }); + + fireEvent.mouseUp(document, { clientX: 120, clientY: 8 }); + + await waitFor(() => { + expect(usePlaybackStore.getState().currentFrame).toBe(36); + expect(usePlaybackStore.getState().previewFrame).toBeNull(); + }); + }); +}); diff --git a/src/features/timeline/components/timeline-playhead.tsx b/src/features/timeline/components/timeline-playhead.tsx index d72fda7d0..a21a444a8 100644 --- a/src/features/timeline/components/timeline-playhead.tsx +++ b/src/features/timeline/components/timeline-playhead.tsx @@ -25,13 +25,13 @@ interface TimelinePlayheadProps { */ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayheadProps) { // Don't subscribe to currentFrame - use ref + manual subscription instead - const setCurrentFrame = usePlaybackStore((s) => s.setCurrentFrame); const setScrubFrame = usePlaybackStore((s) => s.setScrubFrame); const { frameToPixels, pixelsToFrame, pixelsPerSecond } = useTimelineZoomContext(); const [isDragging, setIsDragging] = useState(false); const [isExternalDrag, setIsExternalDrag] = useState(false); const playheadRef = useRef(null); + const isDraggingRef = useRef(false); // Track activeTool via ref subscription to avoid re-renders during playback // This prevents mode toggle from interrupting frame updates @@ -44,7 +44,6 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Use refs to avoid stale closures const pixelsToFrameRef = useRef(pixelsToFrame); - const setCurrentFrameRef = useRef(setCurrentFrame); const setScrubFrameRef = useRef(setScrubFrame); const maxFrameRef = useRef(maxFrame); const frameToPixelsRef = useRef(frameToPixels); @@ -68,14 +67,19 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Update refs when functions change useEffect(() => { pixelsToFrameRef.current = pixelsToFrame; - setCurrentFrameRef.current = setCurrentFrame; setScrubFrameRef.current = setScrubFrame; maxFrameRef.current = maxFrame; frameToPixelsRef.current = frameToPixels; pixelsPerSecondRef.current = pixelsPerSecond; - }, [pixelsToFrame, setCurrentFrame, setScrubFrame, maxFrame, frameToPixels, pixelsPerSecond]); + }, [pixelsToFrame, setScrubFrame, maxFrame, frameToPixels, pixelsPerSecond]); - // Subscribe to currentFrame changes and update position directly (no React re-renders) + useEffect(() => { + isDraggingRef.current = isDragging; + }, [isDragging]); + + // Subscribe to playback frame changes and update position directly. + // During playhead drags, use the same atomic scrub state as the main ruler + // so the fast-scrub overlay hands back to the player consistently. 
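+  // Concretely: the subscription below writes style.left on the playhead
+  // element directly, so scrubbing never triggers a React re-render; it
+  // prefers state.previewFrame while a drag is in flight and falls back to
+  // state.currentFrame once the scrub commits.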
useEffect(() => { const updatePosition = (frame: number) => { if (!playheadRef.current) return; @@ -88,17 +92,24 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead // Subscribe to store changes return usePlaybackStore.subscribe((state) => { - updatePosition(state.currentFrame); + updatePosition( + isDraggingRef.current && state.previewFrame !== null + ? state.previewFrame + : state.currentFrame + ); }); }, []); // Also update position when frameToPixels changes (zoom changes) useLayoutEffect(() => { if (!playheadRef.current) return; - const frame = usePlaybackStore.getState().currentFrame; + const playbackState = usePlaybackStore.getState(); + const frame = isDraggingRef.current && playbackState.previewFrame !== null + ? playbackState.previewFrame + : playbackState.currentFrame; const leftPosition = Math.round(frameToPixels(frame)); playheadRef.current.style.left = `${leftPosition}px`; - }, [frameToPixels]); + }, [frameToPixels, isDragging]); // Track external drag operations to disable pointer events on hit areas useEffect(() => { @@ -120,7 +131,6 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead const handleMouseDown = useCallback((e: React.MouseEvent) => { e.preventDefault(); e.stopPropagation(); - usePlaybackStore.getState().setPreviewFrame(null); const container = inRuler ? playheadRef.current?.closest('.timeline-ruler') : playheadRef.current?.closest('.timeline-tracks'); @@ -196,7 +206,7 @@ export function TimelinePlayhead({ inRuler = false, maxFrame }: TimelinePlayhead } if (pendingFrame !== null) { - setCurrentFrameRef.current(pendingFrame); + setScrubFrameRef.current(pendingFrame); } pendingFrameRef.current = null; diff --git a/src/features/timeline/contracts/media-library.ts b/src/features/timeline/contracts/media-library.ts index 0b8ec0426..5cc6da95e 100644 --- a/src/features/timeline/contracts/media-library.ts +++ b/src/features/timeline/contracts/media-library.ts @@ -7,6 +7,8 @@ export { useTimelineStore } from '../stores/timeline-store'; export { useCompositionNavigationStore } from '../stores/composition-navigation-store'; export { DEFAULT_TRACK_HEIGHT } from '../constants'; export { timelineToSourceFrames, sourceToTimelineFrames } from '../utils/source-calculations'; +export { getNextClassicTrackName, getTrackKind, type TrackKind } from '../utils/classic-tracks'; +export { getEffectiveTrackKindForItem } from '../utils/track-item-compatibility'; export { useCompositionsStore, type SubComposition, diff --git a/src/features/timeline/contracts/preview.ts b/src/features/timeline/contracts/preview.ts index fd5d54cac..05fab42a7 100644 --- a/src/features/timeline/contracts/preview.ts +++ b/src/features/timeline/contracts/preview.ts @@ -35,3 +35,8 @@ export { buildSubCompositionInput, collectSubCompositionMediaIds, } from '../utils/sub-composition-preview'; +export { + createScrubThrottleState, + shouldCommitScrubFrame, + type ScrubThrottleState, +} from '../utils/scrub-throttle'; diff --git a/src/features/timeline/deps/transcribe-dialog.ts b/src/features/timeline/deps/transcribe-dialog.ts new file mode 100644 index 000000000..16f353d07 --- /dev/null +++ b/src/features/timeline/deps/transcribe-dialog.ts @@ -0,0 +1,4 @@ +export { + TranscribeDialog, + type TranscribeDialogValues, +} from './media-library-contract'; diff --git a/src/features/timeline/hooks/use-rate-stretch.ts b/src/features/timeline/hooks/use-rate-stretch.ts index 6825b011d..b1d6e704d 100644 --- a/src/features/timeline/hooks/use-rate-stretch.ts +++ 
b/src/features/timeline/hooks/use-rate-stretch.ts @@ -1,7 +1,7 @@ import { useState, useCallback, useRef, useEffect, useEffectEvent } from 'react'; import type { TimelineItem } from '@/types/timeline'; import { useEditorStore } from '@/app/state/editor'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import type { SnapTarget } from '../types/drag'; import { useTimelineStore } from '../stores/timeline-store'; import { useSelectionStore } from '@/shared/state/selection'; @@ -16,7 +16,11 @@ import { timelineToSourceFrames, } from '../utils/source-calculations'; import { useLinkedEditPreviewStore } from '../stores/linked-edit-preview-store'; -import { getSynchronizedLinkedItems, getLinkedItemIds } from '../utils/linked-items'; +import { + expandItemIdsWithAttachedCaptions, + getSynchronizedLinkedItems, + getLinkedItemIds, +} from '../utils/linked-items'; import { applyRateStretchPreview, applyMovePreview } from '../utils/item-edit-preview'; import type { PreviewItemUpdate } from '../utils/item-edit-preview'; import { useTransitionsStore } from '../stores/transitions-store'; @@ -69,7 +73,7 @@ function computeRipplePreviewUpdates( movedIds.add(itemId); updates.push(applyMovePreview(it, delta)); - for (const linkedId of getLinkedItemIds(items, itemId)) { + for (const linkedId of expandItemIdsWithAttachedCaptions(items, getLinkedItemIds(items, itemId))) { if (linkedId === itemId || movedIds.has(linkedId)) continue; const linked = items.find((i) => i.id === linkedId); if (linked) { @@ -566,7 +570,7 @@ export function useRateStretch(item: TimelineItem, timelineDuration: number, tra e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); setDragState({ isDragging: true, diff --git a/src/features/timeline/hooks/use-timeline-drag.ts b/src/features/timeline/hooks/use-timeline-drag.ts index a8e42f861..92c89d6fb 100644 --- a/src/features/timeline/hooks/use-timeline-drag.ts +++ b/src/features/timeline/hooks/use-timeline-drag.ts @@ -10,6 +10,7 @@ import { useSnapCalculator } from './use-snap-calculator'; import { findNearestAvailableSpace } from '../utils/collision-utils'; import { getTrackKind } from '../utils/classic-tracks'; import { + expandItemIdsWithAttachedCaptions, buildLinkedMovePreviewUpdates, expandSelectionWithLinkedItems, filterUnlockedItemIds, @@ -577,12 +578,13 @@ export function useTimelineDrag( } // Determine which items to drag - const itemsToDrag = isInSelection + const baseItemsToDrag = isInSelection ? (linkedSelectionEnabled ? 
expandSelectionWithLinkedItems(allItems, currentSelectedIds) : currentSelectedIds) : linkedIds; + const itemsToDrag = expandItemIdsWithAttachedCaptions(allItems, baseItemsToDrag); const draggableItemIds = filterUnlockedItemIds(allItems, currentTracks, itemsToDrag); - if (isInSelection && itemsToDrag.length !== currentSelectedIds.length) { - selectItems(itemsToDrag); + if (isInSelection && baseItemsToDrag.length !== currentSelectedIds.length) { + selectItems(baseItemsToDrag); } // Store initial state for all dragged items diff --git a/src/features/timeline/hooks/use-timeline-slip-slide.ts b/src/features/timeline/hooks/use-timeline-slip-slide.ts index 233d13732..0f279b5d7 100644 --- a/src/features/timeline/hooks/use-timeline-slip-slide.ts +++ b/src/features/timeline/hooks/use-timeline-slip-slide.ts @@ -1,6 +1,7 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import type { Transition } from '@/types/transition'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useEditorStore } from '@/app/state/editor'; import { DRAG_THRESHOLD_PIXELS } from '../constants'; import { useTimelineStore } from '../stores/timeline-store'; @@ -8,6 +9,7 @@ import { useTransitionsStore } from '../stores/transitions-store'; import { useSelectionStore } from '@/shared/state/selection'; import { pixelsToTimeNow } from '../utils/zoom-conversions'; import { useSnapCalculator } from './use-snap-calculator'; +import type { SnapTarget } from '../types/drag'; import { useSlipEditPreviewStore } from '../stores/slip-edit-preview-store'; import { useSlideEditPreviewStore } from '../stores/slide-edit-preview-store'; import { useLinkedEditPreviewStore } from '../stores/linked-edit-preview-store'; @@ -24,7 +26,7 @@ import { getMatchingSynchronizedLinkedCounterpart, getSynchronizedLinkedItems, } from '../utils/linked-items'; -import { clampSlipDeltaToPreserveTransitions, clampSlideDeltaToPreserveTransitions } from '../utils/transition-utils'; +import { canAddTransition, clampSlipDeltaToPreserveTransitions, clampSlideDeltaToPreserveTransitions } from '../utils/transition-utils'; import { applyMovePreview, applySlipPreview, @@ -33,6 +35,7 @@ import { type PreviewItemUpdate, } from '../utils/item-edit-preview'; import { hasExceededDragThreshold } from '../utils/drag-threshold'; +import { computeSlideContinuitySourceDelta } from '../utils/slide-utils'; interface SlipSlideState { isActive: boolean; @@ -50,6 +53,153 @@ interface SlipSlideStartOptions { activateOnMoveThreshold?: boolean; } +interface SlideParticipantConstraintContext { + participant: TimelineItem; + leftAdjacent: TimelineItem | null; + rightAdjacent: TimelineItem | null; + nearestNeighbors: ReturnType; + excludeIds: Set; + leftAdjacentNearestStart: number | null; + rightAdjacentNearestEnd: number | null; +} + +interface SlideGestureContext { + currentItem: TimelineItem; + allItems: TimelineItem[]; + itemsById: Map; + transitions: Transition[]; + leftNeighbor: TimelineItem | null; + rightNeighbor: TimelineItem | null; + snapTargets: SnapTarget[]; + snapExcludeIds: Set; + linkedSelectionEnabled: boolean; + synchronizedCounterpart: TimelineItem | null; + leftCounterpart: TimelineItem | null; + rightCounterpart: TimelineItem | null; + slideItemIds: Set; + primaryNearestNeighbors: ReturnType; + leftNeighborNearestStart: number | null; + rightNeighborNearestEnd: number | null; + participantContexts: 
SlideParticipantConstraintContext[]; + relatedTransitions: Transition[]; +} + +function findAdjacentTrackNeighbors( + item: TimelineItem, + items: TimelineItem[], +): { leftAdjacent: TimelineItem | null; rightAdjacent: TimelineItem | null } { + const itemEnd = item.from + item.durationInFrames; + let leftAdjacent: TimelineItem | null = null; + let rightAdjacent: TimelineItem | null = null; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId) continue; + const otherEnd = other.from + other.durationInFrames; + + if (otherEnd === item.from && (!leftAdjacent || other.from > leftAdjacent.from)) { + leftAdjacent = other; + } + if (other.from === itemEnd && (!rightAdjacent || other.from < rightAdjacent.from)) { + rightAdjacent = other; + } + } + + return { leftAdjacent, rightAdjacent }; +} + +function findNearestStartAtOrAfter( + item: TimelineItem, + items: TimelineItem[], + excludeIds: ReadonlySet, +): number | null { + const itemEnd = item.from + item.durationInFrames; + let nearestStart = Infinity; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId || excludeIds.has(other.id)) continue; + if (other.from >= itemEnd) { + nearestStart = Math.min(nearestStart, other.from); + } + } + + return Number.isFinite(nearestStart) ? nearestStart : null; +} + +function findNearestEndAtOrBefore( + item: TimelineItem, + items: TimelineItem[], + excludeIds: ReadonlySet, +): number | null { + let nearestEnd = -Infinity; + + for (const other of items) { + if (other.id === item.id || other.trackId !== item.trackId || excludeIds.has(other.id)) continue; + const otherEnd = other.from + other.durationInFrames; + if (otherEnd <= item.from) { + nearestEnd = Math.max(nearestEnd, otherEnd); + } + } + + return Number.isFinite(nearestEnd) ? nearestEnd : null; +} + +function clampEndAgainstNearestStart( + item: TimelineItem, + trimAmount: number, + nearestStart: number | null, +): number { + if (trimAmount <= 0 || nearestStart === null) return trimAmount; + const itemEnd = item.from + item.durationInFrames; + const maxExtend = nearestStart - itemEnd; + return trimAmount > maxExtend ? maxExtend : trimAmount; +} + +function clampStartAgainstNearestEnd( + item: TimelineItem, + trimAmount: number, + nearestEnd: number | null, +): number { + if (trimAmount >= 0 || nearestEnd === null) return trimAmount; + const maxExtend = item.from - nearestEnd; + if (-trimAmount > maxExtend) { + return maxExtend > 0 ? -maxExtend : 0; + } + return trimAmount; +} + +function applyPreviewUpdate( + item: TimelineItem, + previewUpdate: PreviewItemUpdate | null | undefined, +): TimelineItem { + return previewUpdate + ? ({ ...item, ...previewUpdate } as TimelineItem) + : item; +} + +function clampDeltaToLastValidValue( + requestedDelta: number, + isValid: (delta: number) => boolean, +): number { + if (!isValid(0)) return 0; + if (isValid(requestedDelta)) return requestedDelta; + + const sign = requestedDelta < 0 ? -1 : 1; + let low = 0; + let high = Math.abs(requestedDelta); + + while (low < high) { + const mid = Math.ceil((low + high) / 2); + const candidate = sign * mid; + if (isValid(candidate)) { + low = mid; + } else { + high = mid - 1; + } + } + + return sign * low; +} + /** * Hook for handling slip and slide editing on timeline items. 
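 *
 * Slip shifts the clip's source in/out points without moving the clip on the
 * timeline; slide moves the clip while trimming the left neighbor's end and
 * the right neighbor's start, so surrounding clips keep their positions.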
* @@ -88,6 +238,7 @@ export function useTimelineSlipSlide( stateRef.current = state; const latestDeltaRef = useRef(0); const pendingStartCleanupRef = useRef<(() => void) | null>(null); + const slideGestureContextRef = useRef(null); const getItemFromStore = useCallback(() => { return useTimelineStore.getState().items.find((i) => i.id === item.id) ?? item; @@ -104,8 +255,98 @@ export function useTimelineSlipSlide( return findEditNeighborsWithTransitions(currentItem, allItems, transitions); }, [getItemFromStore]); + const buildSlideGestureContext = useCallback(( + currentItem: TimelineItem, + leftNeighbor: TimelineItem | null, + rightNeighbor: TimelineItem | null, + ): SlideGestureContext => { + const allItems = useTimelineStore.getState().items; + const transitions = useTransitionsStore.getState().transitions; + const itemsById = new Map(allItems.map((candidate) => [candidate.id, candidate])); + const linkedSelectionEnabled = useEditorStore.getState().linkedSelectionEnabled; + const synchronizedItems = linkedSelectionEnabled + ? getSynchronizedLinkedItems(allItems, currentItem.id) + : [currentItem]; + const synchronizedCounterpart = synchronizedItems.find((candidate) => candidate.id !== currentItem.id) ?? null; + const leftCounterpart = leftNeighbor && synchronizedCounterpart + ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighbor.id, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const rightCounterpart = rightNeighbor && synchronizedCounterpart + ? getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighbor.id, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const slideItemIds = new Set([ + currentItem.id, + leftNeighbor?.id ?? '', + rightNeighbor?.id ?? '', + ].filter(Boolean)); + const snapExcludeIds = new Set(slideItemIds); + const snapTargets = snapEnabled ? getMagneticSnapTargets() : []; + const primaryNearestNeighbors = findNearestNeighbors(currentItem, allItems); + const leftNeighborNearestStart = leftNeighbor + ? findNearestStartAtOrAfter(leftNeighbor, allItems, slideItemIds) + : null; + const rightNeighborNearestEnd = rightNeighbor + ? findNearestEndAtOrBefore(rightNeighbor, allItems, slideItemIds) + : null; + + const participantContexts: SlideParticipantConstraintContext[] = synchronizedItems + .filter((candidate) => candidate.id !== currentItem.id) + .map((participant) => { + const excludeIds = new Set(slideItemIds); + for (const synchronizedItem of synchronizedItems) { + excludeIds.add(synchronizedItem.id); + } + + const { leftAdjacent, rightAdjacent } = findAdjacentTrackNeighbors(participant, allItems); + if (leftAdjacent) excludeIds.add(leftAdjacent.id); + if (rightAdjacent) excludeIds.add(rightAdjacent.id); + + return { + participant, + leftAdjacent, + rightAdjacent, + nearestNeighbors: findNearestNeighbors(participant, allItems), + excludeIds, + leftAdjacentNearestStart: leftAdjacent + ? findNearestStartAtOrAfter(leftAdjacent, allItems, excludeIds) + : null, + rightAdjacentNearestEnd: rightAdjacent + ? 
findNearestEndAtOrBefore(rightAdjacent, allItems, excludeIds) + : null, + }; + }); + + const affectedIds = new Set([currentItem.id]); + if (leftNeighbor) affectedIds.add(leftNeighbor.id); + if (rightNeighbor) affectedIds.add(rightNeighbor.id); + const relatedTransitions = transitions.filter((transition) => ( + affectedIds.has(transition.leftClipId) || affectedIds.has(transition.rightClipId) + )); + + return { + currentItem, + allItems, + itemsById, + transitions, + leftNeighbor, + rightNeighbor, + snapTargets, + snapExcludeIds, + linkedSelectionEnabled, + synchronizedCounterpart, + leftCounterpart, + rightCounterpart, + slideItemIds, + primaryNearestNeighbors, + leftNeighborNearestStart, + rightNeighborNearestEnd, + participantContexts, + relatedTransitions, + }; + }, [getMagneticSnapTargets, snapEnabled]); + const beginSlipSlideGesture = useCallback((startX: number, mode: 'slip' | 'slide') => { - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const { leftNeighbor, rightNeighbor } = findNeighbors(); const currentItem = getItemFromStore(); @@ -137,6 +378,7 @@ export function useTimelineSlipSlide( trackId: currentItem.trackId, slipDelta: 0, }); + slideGestureContextRef.current = null; } else { // Compute the effective slide range (tightest across all tracks), // incorporating transition constraints so the initial limit box matches @@ -162,6 +404,7 @@ export function useTimelineSlipSlide( minDelta: slideMinDelta, maxDelta: slideMaxDelta, }); + slideGestureContextRef.current = buildSlideGestureContext(currentItem, leftNeighbor ?? null, rightNeighbor ?? null); } // Seed linked companion previews with zero-delta so their overlays appear immediately @@ -179,7 +422,7 @@ export function useTimelineSlipSlide( } // Note: clampSlideDelta intentionally omitted — it reads fps from store at // call time, and including it would cause a TDZ error (defined after this hook). - }, [findNeighbors, getItemFromStore, item.id, setDragState]); + }, [buildSlideGestureContext, findNeighbors, getItemFromStore, item.id, setDragState]); /** * Clamp slip delta to source boundaries. 
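 * A slip never changes `from` or `durationInFrames`; only the clip's source
 * window moves, so the clamp runs purely against the available source media.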
@@ -303,6 +546,152 @@ export function useTimelineSlipSlide( return clamped; }, [getItemFromStore, fps, item.id]); + const clampSlideDeltaWithContext = useCallback((delta: number, context: SlideGestureContext): number => { + let clamped = delta; + const { currentItem } = context; + + if (currentItem.from + clamped < 0) { + clamped = -currentItem.from; + } + + if (context.leftNeighbor) { + const { clampedAmount } = clampTrimAmount(context.leftNeighbor, 'end', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampEndAgainstNearestStart( + context.leftNeighbor, + clamped, + context.leftNeighborNearestStart, + ); + } + + if (context.rightNeighbor) { + const { clampedAmount } = clampTrimAmount(context.rightNeighbor, 'start', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampStartAgainstNearestEnd( + context.rightNeighbor, + clamped, + context.rightNeighborNearestEnd, + ); + } + + for (const participantContext of context.participantContexts) { + if (participantContext.leftAdjacent) { + const { clampedAmount } = clampTrimAmount(participantContext.leftAdjacent, 'end', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampEndAgainstNearestStart( + participantContext.leftAdjacent, + clamped, + participantContext.leftAdjacentNearestStart, + ); + } + + if (participantContext.rightAdjacent) { + const { clampedAmount } = clampTrimAmount(participantContext.rightAdjacent, 'start', clamped, fps); + if (Math.abs(clampedAmount) < Math.abs(clamped)) { + clamped = clampedAmount; + } + clamped = clampStartAgainstNearestEnd( + participantContext.rightAdjacent, + clamped, + participantContext.rightAdjacentNearestEnd, + ); + } + + const leftWall = participantContext.nearestNeighbors.leftNeighbor; + if (leftWall && !participantContext.excludeIds.has(leftWall.id)) { + const wallRight = leftWall.from + leftWall.durationInFrames; + const maxLeft = -(participantContext.participant.from - wallRight); + if (clamped < maxLeft) clamped = maxLeft; + } + + const rightWall = participantContext.nearestNeighbors.rightNeighbor; + if (rightWall && !participantContext.excludeIds.has(rightWall.id)) { + const participantEnd = participantContext.participant.from + participantContext.participant.durationInFrames; + const maxRight = rightWall.from - participantEnd; + if (clamped > maxRight) clamped = maxRight; + } + } + + const primaryLeftWall = context.primaryNearestNeighbors.leftNeighbor; + if (primaryLeftWall && !context.slideItemIds.has(primaryLeftWall.id)) { + const wallRight = primaryLeftWall.from + primaryLeftWall.durationInFrames; + const maxLeft = -(currentItem.from - wallRight); + if (clamped < maxLeft) clamped = maxLeft; + } + + const primaryRightWall = context.primaryNearestNeighbors.rightNeighbor; + if (primaryRightWall && !context.slideItemIds.has(primaryRightWall.id)) { + const primaryEnd = currentItem.from + currentItem.durationInFrames; + const maxRight = primaryRightWall.from - primaryEnd; + if (clamped > maxRight) clamped = maxRight; + } + + return clamped; + }, [fps]); + + const clampSlideDeltaToPreserveTransitionsWithContext = useCallback(( + requestedDelta: number, + context: SlideGestureContext, + ): number => { + if (requestedDelta === 0 || context.relatedTransitions.length === 0) { + return requestedDelta; + } + + const isValid = (delta: number): boolean => { + const previewById = new Map(); + + if (context.leftNeighbor) { + 
previewById.set( + context.leftNeighbor.id, + applyPreviewUpdate(context.leftNeighbor, applyTrimEndPreview(context.leftNeighbor, delta, fps)), + ); + } + + if (context.rightNeighbor) { + previewById.set( + context.rightNeighbor.id, + applyPreviewUpdate(context.rightNeighbor, applyTrimStartPreview(context.rightNeighbor, delta, fps)), + ); + } + + let slidItemPreview = applyPreviewUpdate(context.currentItem, applyMovePreview(context.currentItem, delta)); + const continuitySourceDelta = computeSlideContinuitySourceDelta( + context.currentItem, + context.leftNeighbor, + context.rightNeighbor, + delta, + fps, + ); + if ( + continuitySourceDelta !== 0 + && (slidItemPreview.type === 'video' || slidItemPreview.type === 'audio' || slidItemPreview.type === 'composition') + && slidItemPreview.sourceEnd !== undefined + ) { + slidItemPreview = { + ...slidItemPreview, + sourceStart: (slidItemPreview.sourceStart ?? 0) + continuitySourceDelta, + sourceEnd: slidItemPreview.sourceEnd + continuitySourceDelta, + }; + } + previewById.set(context.currentItem.id, slidItemPreview); + + return context.relatedTransitions.every((transition) => { + const leftClip = previewById.get(transition.leftClipId) ?? context.itemsById.get(transition.leftClipId) ?? null; + const rightClip = previewById.get(transition.rightClipId) ?? context.itemsById.get(transition.rightClipId) ?? null; + if (!leftClip || !rightClip) return true; + return canAddTransition(leftClip, rightClip, transition.durationInFrames, transition.alignment).canAdd; + }); + }; + + return clampDeltaToLastValidValue(requestedDelta, isValid); + }, [fps]); + // Mouse move handler const handleMouseMove = useCallback( (e: MouseEvent) => { @@ -385,15 +774,18 @@ export function useTimelineSlipSlide( useLinkedEditPreviewStore.getState().setUpdates(linkedPreviewUpdates); } else if (mode === 'slide') { + const slideContext = slideGestureContextRef.current; const { leftNeighborId, rightNeighborId } = stateRef.current; - const storeItem = getItemFromStore(); + const storeItem = slideContext?.currentItem ?? getItemFromStore(); // Apply snapping for slide (clip edges snap to items/playhead/grid) if (snapEnabled) { - const targets = getMagneticSnapTargets(); - const excludeIds = new Set([item.id]); - if (leftNeighborId) excludeIds.add(leftNeighborId); - if (rightNeighborId) excludeIds.add(rightNeighborId); + const targets = slideContext?.snapTargets ?? getMagneticSnapTargets(); + const excludeIds = slideContext?.snapExcludeIds ?? new Set([ + item.id, + leftNeighborId ?? '', + rightNeighborId ?? '', + ].filter(Boolean)); const newStart = storeItem.from + deltaFrames; const newEnd = newStart + storeItem.durationInFrames; @@ -425,17 +817,21 @@ export function useTimelineSlipSlide( } } - const allItems = useTimelineStore.getState().items; - const sourceClamped = clampSlideDelta(deltaFrames, leftNeighborId, rightNeighborId); - const transitionClamped = clampSlideDeltaToPreserveTransitions( - storeItem, - sourceClamped, - leftNeighborId ? (allItems.find((candidate) => candidate.id === leftNeighborId) ?? null) : null, - rightNeighborId ? (allItems.find((candidate) => candidate.id === rightNeighborId) ?? null) : null, - allItems, - useTransitionsStore.getState().transitions, - fps, - ); + const allItems = slideContext?.allItems ?? useTimelineStore.getState().items; + const sourceClamped = slideContext + ? clampSlideDeltaWithContext(deltaFrames, slideContext) + : clampSlideDelta(deltaFrames, leftNeighborId, rightNeighborId); + const transitionClamped = slideContext + ? 
clampSlideDeltaToPreserveTransitionsWithContext(sourceClamped, slideContext) + : clampSlideDeltaToPreserveTransitions( + storeItem, + sourceClamped, + leftNeighborId ? (allItems.find((candidate) => candidate.id === leftNeighborId) ?? null) : null, + rightNeighborId ? (allItems.find((candidate) => candidate.id === rightNeighborId) ?? null) : null, + allItems, + useTransitionsStore.getState().transitions, + fps, + ); const clamped = transitionClamped; const isConstrained = clamped !== deltaFrames; const constraintEdge = !isConstrained @@ -485,21 +881,27 @@ export function useTimelineSlipSlide( } const linkedSelectionEnabled = useEditorStore.getState().linkedSelectionEnabled; - const synchronizedCounterpart = linkedSelectionEnabled - ? getSynchronizedLinkedItems(allItems, storeItem.id) - .find((candidate) => candidate.id !== storeItem.id) ?? null - : null; + const synchronizedCounterpart = slideContext + ? slideContext.synchronizedCounterpart + : linkedSelectionEnabled + ? getSynchronizedLinkedItems(allItems, storeItem.id) + .find((candidate) => candidate.id !== storeItem.id) ?? null + : null; const linkedPreviewUpdates: PreviewItemUpdate[] = []; if (synchronizedCounterpart) { linkedPreviewUpdates.push(applyMovePreview(synchronizedCounterpart, clamped)); - const leftCounterpart = leftNeighborId - ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) - : null; - const rightCounterpart = rightNeighborId - ? getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) - : null; + const leftCounterpart = slideContext + ? slideContext.leftCounterpart + : leftNeighborId + ? getMatchingSynchronizedLinkedCounterpart(allItems, leftNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; + const rightCounterpart = slideContext + ? slideContext.rightCounterpart + : rightNeighborId + ? 
getMatchingSynchronizedLinkedCounterpart(allItems, rightNeighborId, synchronizedCounterpart.trackId, synchronizedCounterpart.type) + : null; if (leftCounterpart) { linkedPreviewUpdates.push(applyTrimEndPreview(leftCounterpart, clamped, fps)); @@ -513,7 +915,20 @@ export function useTimelineSlipSlide( } }, - [pixelsToTime, fps, trackLocked, item.id, getItemFromStore, clampSlipDelta, clampSlideDelta, snapEnabled, getMagneticSnapTargets, getSnapThresholdFrames], + [ + pixelsToTime, + fps, + trackLocked, + item.id, + getItemFromStore, + clampSlipDelta, + clampSlideDelta, + clampSlideDeltaToPreserveTransitionsWithContext, + clampSlideDeltaWithContext, + snapEnabled, + getMagneticSnapTargets, + getSnapThresholdFrames, + ], ); // Mouse up handler — commits changes @@ -552,6 +967,7 @@ export function useTimelineSlipSlide( constraintLabel: null, }); latestDeltaRef.current = 0; + slideGestureContextRef.current = null; } }, [item.id, setDragState]); @@ -571,6 +987,7 @@ export function useTimelineSlipSlide( useLinkedEditPreviewStore.getState().clear(); setDragState(null); latestDeltaRef.current = 0; + slideGestureContextRef.current = null; } }; } @@ -578,6 +995,7 @@ export function useTimelineSlipSlide( useEffect(() => () => { pendingStartCleanupRef.current?.(); + slideGestureContextRef.current = null; }, []); // Start slip/slide drag diff --git a/src/features/timeline/hooks/use-timeline-trim.ts b/src/features/timeline/hooks/use-timeline-trim.ts index a1700b824..7c98a728d 100644 --- a/src/features/timeline/hooks/use-timeline-trim.ts +++ b/src/features/timeline/hooks/use-timeline-trim.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useEditorStore } from '@/app/state/editor'; import { toast } from 'sonner'; import type { SnapTarget } from '../types/drag'; @@ -719,7 +719,7 @@ export function useTimelineTrim(item: TimelineItem, timelineDuration: number, tr // including guardrail early returns. e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const forcedMode = options?.forcedMode ?? null; const destroyTransitionAtHandle = options?.destroyTransitionAtHandle ?? 
false; diff --git a/src/features/timeline/hooks/use-track-push.ts b/src/features/timeline/hooks/use-track-push.ts index 5ef1e5a2b..95d895cd9 100644 --- a/src/features/timeline/hooks/use-track-push.ts +++ b/src/features/timeline/hooks/use-track-push.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { TimelineItem } from '@/types/timeline'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { useSelectionStore } from '@/shared/state/selection'; import { useTimelineStore } from '../stores/timeline-store'; import { useItemsStore } from '../stores/items-store'; @@ -135,7 +135,7 @@ export function useTrackPush(item: TimelineItem, timelineDuration: number, track if (e.button !== 0 || trackLocked) return; e.stopPropagation(); e.preventDefault(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); const { items: allItems, itemsByTrackId } = useItemsStore.getState(); const cutFrame = item.from; diff --git a/src/features/timeline/hooks/use-transition-resize.ts b/src/features/timeline/hooks/use-transition-resize.ts index c1cc96f0b..57173b276 100644 --- a/src/features/timeline/hooks/use-transition-resize.ts +++ b/src/features/timeline/hooks/use-transition-resize.ts @@ -1,6 +1,6 @@ import { useState, useCallback, useRef, useEffect, useMemo } from 'react'; import type { Transition } from '@/types/transition'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { commitPreviewFrameToCurrentFrame } from '@/shared/state/playback'; import { TRANSITION_CONFIGS } from '@/types/transition'; import { useTimelineStore } from '../stores/timeline-store'; import { useItemsStore } from '../stores/items-store'; @@ -132,7 +132,7 @@ export function useTransitionResize(transition: Transition) { (e: React.MouseEvent, handle: ResizeHandle) => { e.preventDefault(); e.stopPropagation(); - usePlaybackStore.getState().setPreviewFrame(null); + commitPreviewFrameToCurrentFrame(); setResizeState({ isResizing: true, diff --git a/src/features/timeline/services/filmstrip-cache.test.ts b/src/features/timeline/services/filmstrip-cache.test.ts index ef383b5fc..9b5a2fca1 100644 --- a/src/features/timeline/services/filmstrip-cache.test.ts +++ b/src/features/timeline/services/filmstrip-cache.test.ts @@ -22,10 +22,15 @@ vi.mock('@/shared/logging/logger', () => ({ createLogger: vi.fn(() => loggerMocks), })); -vi.mock('./filmstrip-opfs-storage', () => ({ - filmstripOPFSStorage: { +vi.mock('./filmstrip-storage', () => ({ + filmstripStorage: { load: vi.fn(), saveMetadata: vi.fn(), + saveFrameBlob: vi.fn(), + loadSingleFrame: vi.fn(), + getExistingIndices: vi.fn(), + createFrameFromBitmap: vi.fn(), + createFrameFromBlob: vi.fn(), revokeUrls: vi.fn(), delete: vi.fn(), clearAll: vi.fn(), diff --git a/src/features/timeline/services/filmstrip-cache.ts b/src/features/timeline/services/filmstrip-cache.ts index d17f065d9..f9e968d88 100644 --- a/src/features/timeline/services/filmstrip-cache.ts +++ b/src/features/timeline/services/filmstrip-cache.ts @@ -3,7 +3,7 @@ * * Simple service that: * 1. Manages extraction worker - * 2. Provides object URLs from OPFS storage + * 2. Provides object URLs from persisted filmstrip storage * 3. Notifies subscribers when new frames are available * * No ImageBitmaps in memory - just URLs for tags. 
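 *
 * The filmstripStorage facade keeps the old OPFS service's surface: load,
 * saveMetadata, saveFrameBlob, loadSingleFrame, getExistingIndices,
 * createFrameFromBitmap, createFrameFromBlob, revokeUrls, delete, and
 * clearAll, as exercised by the updated test mock.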
@@ -25,7 +25,7 @@ import { FILMSTRIP_EXTRACT_HEIGHT, THUMBNAIL_WIDTH, } from '@/features/timeline/constants'; -import { filmstripOPFSStorage, type FilmstripFrame } from './filmstrip-opfs-storage'; +import { filmstripStorage, type FilmstripFrame } from './filmstrip-storage'; import { FilmstripMemoryState } from './filmstrip-memory-state'; import type { ExtractRequest, WorkerResponse } from '../workers/filmstrip-extraction-worker'; @@ -319,7 +319,7 @@ class FilmstripCacheService { this.cache.delete(mediaId); this.clearCacheMeta(mediaId); - filmstripOPFSStorage.revokeUrls(mediaId); + filmstripStorage.revokeUrls(mediaId); this.clearIdleEvictionTimer(mediaId); logger.debug(`Evicted in-memory filmstrip ${mediaId} (${reason})`); return true; @@ -798,7 +798,7 @@ class FilmstripCacheService { frames: cached.frames, existingIndices: cached.frames.map((frame) => frame.index), } - : await filmstripOPFSStorage.load(mediaId); + : await filmstripStorage.load(mediaId); const existingFrames = stored?.frames ?? []; const existingIndices = stored?.existingIndices ?? []; @@ -1082,7 +1082,7 @@ class FilmstripCacheService { options?: FilmstripLoadOptions, ): Promise { // Try loading from storage - const stored = await filmstripOPFSStorage.load(mediaId); + const stored = await filmstripStorage.load(mediaId); if (stored?.metadata.isComplete) { // Complete - return immediately @@ -1231,7 +1231,7 @@ class FilmstripCacheService { const targetFrames = [...existingFrames].sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(pending, targetFrames); if (settled.isComplete && this.shouldPersistCompletionMetadata(pending)) { - void filmstripOPFSStorage.saveMetadata(mediaId, { + void filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: true, @@ -1251,7 +1251,7 @@ class FilmstripCacheService { // Persist extraction session metadata once. Workers should focus on frame // writes; centralizing meta writes avoids cross-worker file contention. - void filmstripOPFSStorage.saveMetadata(mediaId, { + void filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: false, @@ -1504,9 +1504,9 @@ class FilmstripCacheService { } // When blobs arrive (after JPEG encode), upgrade frames with proper URLs - // and persist to OPFS. This replaces bitmap-only frames. + // and persist them to the workspace. This replaces bitmap-only frames. 
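+    // Awaiting ingestSavedFrames matters now: it also writes each blob via
+    // filmstripStorage.saveFrameBlob, so the frames are on disk before the
+    // loadNewFrames pass below re-reads them from storage.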
if (Array.isArray(response.savedFrames) && response.savedFrames.length > 0) { - this.ingestSavedFrames( + await this.ingestSavedFrames( mediaId, response.savedFrames.filter((frame) => frame.index >= workerState.startIndex @@ -1525,14 +1525,14 @@ class FilmstripCacheService { try { await this.loadNewFrames(mediaId, newIndices); } catch (error) { - logger.error('Failed to load saved filmstrip frames from OPFS', { + logger.error('Failed to load saved filmstrip frames from persisted storage', { mediaId, requestId: workerState.requestId, range: [workerState.startIndex, workerState.endIndex], newIndicesCount: newIndices.length, error, }); - this.handleWorkerError(mediaId, 'Failed to load saved frames from OPFS'); + this.handleWorkerError(mediaId, 'Failed to load saved frames from storage'); return; } } @@ -1544,14 +1544,14 @@ class FilmstripCacheService { try { await this.flushWorkerRangeLoads(mediaId, workerState); } catch (error) { - logger.error('Failed to flush worker frame range loads from OPFS', { + logger.error('Failed to flush worker frame range loads from persisted storage', { mediaId, requestId: workerState.requestId, range: [workerState.startIndex, workerState.endIndex], newFrameCount, error, }); - this.handleWorkerError(mediaId, 'Failed to refresh worker frame range from OPFS'); + this.handleWorkerError(mediaId, 'Failed to refresh worker frame range from storage'); return; } } @@ -1580,12 +1580,12 @@ class FilmstripCacheService { // Check if all workers are done if (pending.completedWorkers === pending.workers.length) { // All workers done - finalize directly from in-memory extracted frames - // to avoid an extra full OPFS directory scan and URL recreation pass. + // to avoid an extra full storage scan and URL recreation pass. const finalFrames = Array.from(pending.extractedFrames.values()) .sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(pending, finalFrames); try { - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: settled.isComplete && this.shouldPersistCompletionMetadata(pending), @@ -1678,7 +1678,7 @@ class FilmstripCacheService { if (!pending) return 0; // Discover what is actually saved on disk for this worker's range. 
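    // getExistingIndices is range-scoped: it reports only the frame indices
    // within [startIndex, endIndex] that already have persisted files.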
- const inRangeExistingIndices = await filmstripOPFSStorage.getExistingIndices( + const inRangeExistingIndices = await filmstripStorage.getExistingIndices( mediaId, startIndex, endIndex @@ -1703,7 +1703,7 @@ class FilmstripCacheService { if (indices.length === 0) return; const loadPromises = indices.map(async (index) => { - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, index); + const frame = await filmstripStorage.loadSingleFrame(mediaId, index); if (frame) { pending.extractedFrames.set(index, frame); this.noteFirstFrame(pending.metrics); @@ -1725,7 +1725,7 @@ class FilmstripCacheService { bf.bitmap.close(); continue; } - const frame = filmstripOPFSStorage.createFrameFromBitmap(mediaId, bf.index, bf.bitmap); + const frame = filmstripStorage.createFrameFromBitmap(mediaId, bf.index, bf.bitmap); if (frame) { pending.extractedFrames.set(bf.index, frame); this.noteFirstFrame(pending.metrics); @@ -1733,16 +1733,18 @@ class FilmstripCacheService { } } - private ingestSavedFrames( + private async ingestSavedFrames( mediaId: string, savedFrames: Array<{ index: number; blob: Blob }> - ): void { + ): Promise { const pending = this.pendingExtractions.get(mediaId); if (!pending || savedFrames.length === 0) return; + const persistWrites: Promise[] = []; + for (const saved of savedFrames) { const existing = pending.extractedFrames.get(saved.index); - const frame = filmstripOPFSStorage.createFrameFromBlob(mediaId, saved.index, saved.blob); + const frame = filmstripStorage.createFrameFromBlob(mediaId, saved.index, saved.blob); if (frame) { // Close bitmap if this frame was previously bitmap-only if (existing?.bitmap) { @@ -1753,6 +1755,13 @@ class FilmstripCacheService { this.noteFirstFrame(pending.metrics); } } + persistWrites.push( + filmstripStorage.saveFrameBlob(mediaId, saved.index, saved.blob), + ); + } + + if (persistWrites.length > 0) { + await Promise.all(persistWrites); } } @@ -1919,7 +1928,7 @@ class FilmstripCacheService { const totalTargetFrames = Math.max(1, targetIndices.length); let extractedTargetCount = skipSet.size; - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: false, @@ -1951,9 +1960,9 @@ class FilmstripCacheService { this.drawCoverFrame(video, ctx, canvas.width, canvas.height); const blob = await this.canvasToBlob(canvas); - await filmstripOPFSStorage.saveFrameBlob(mediaId, i, blob); + await filmstripStorage.saveFrameBlob(mediaId, i, blob); - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, i); + const frame = await filmstripStorage.loadSingleFrame(mediaId, i); if (frame) { currentPending.extractedFrames.set(i, frame); this.noteFirstFrame(currentPending.metrics); @@ -1987,7 +1996,7 @@ class FilmstripCacheService { const finalFrames = Array.from(finishedPending.extractedFrames.values()) .sort((a, b) => a.index - b.index); const settled = this.buildSettledFilmstrip(finishedPending, finalFrames); - await filmstripOPFSStorage.saveMetadata(mediaId, { + await filmstripStorage.saveMetadata(mediaId, { width: FILMSTRIP_EXTRACT_WIDTH, height: FILMSTRIP_EXTRACT_HEIGHT, isComplete: settled.isComplete && this.shouldPersistCompletionMetadata(finishedPending), @@ -2285,7 +2294,7 @@ class FilmstripCacheService { } /** - * Refresh cached frame URLs from OPFS when a visible tile reports a stale source. + * Refresh cached frame URLs from persisted storage when a visible tile reports a stale source. 
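+   * (Frame object URLs can go stale once the backing blob is replaced; this
+   * re-reads the affected frames and mints fresh URLs for them.)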
*/ async refreshFrames(mediaId: string, frameIndices: number[]): Promise { const normalizedIndices = Array.from(new Set( @@ -2296,7 +2305,7 @@ class FilmstripCacheService { } const refreshedEntries = await Promise.all(normalizedIndices.map(async (index) => { - const frame = await filmstripOPFSStorage.loadSingleFrame(mediaId, index); + const frame = await filmstripStorage.loadSingleFrame(mediaId, index); return frame ? [index, frame] as const : null; })); const refreshedByIndex = new Map( @@ -2369,8 +2378,8 @@ class FilmstripCacheService { this.clearIdleEvictionTimer(mediaId); this.cache.delete(mediaId); this.clearCacheMeta(mediaId); - filmstripOPFSStorage.revokeUrls(mediaId); - await filmstripOPFSStorage.delete(mediaId); + filmstripStorage.revokeUrls(mediaId); + await filmstripStorage.delete(mediaId); } /** @@ -2382,7 +2391,7 @@ class FilmstripCacheService { } this.cache.clear(); this.memoryState.clear(); - await filmstripOPFSStorage.clearAll(); + await filmstripStorage.clearAll(); } /** @@ -2390,7 +2399,7 @@ class FilmstripCacheService { * * IMPORTANT: * - This is runtime cleanup only (workers, timers, in-memory URLs/cache). - * - Do NOT clear OPFS filmstrip files here. + * - Do NOT clear persisted filmstrip files here. * Persistent filmstrip data must survive page refresh so F5 can reuse cache. * - Use clearAll()/clearMedia() only for explicit user/debug cache reset flows. */ @@ -2399,9 +2408,9 @@ class FilmstripCacheService { this.abort(mediaId); } this.workerPoolManager.terminateAll(); - // Revoke in-memory object URLs only; keep persisted OPFS filmstrip files. + // Revoke in-memory object URLs only; keep persisted filmstrip files. for (const mediaId of this.cache.keys()) { - filmstripOPFSStorage.revokeUrls(mediaId); + filmstripStorage.revokeUrls(mediaId); } this.cache.clear(); this.memoryState.clear(); diff --git a/src/features/timeline/services/filmstrip-memory-state.ts b/src/features/timeline/services/filmstrip-memory-state.ts index 1e395ea1a..7be6ed086 100644 --- a/src/features/timeline/services/filmstrip-memory-state.ts +++ b/src/features/timeline/services/filmstrip-memory-state.ts @@ -2,7 +2,7 @@ import { FILMSTRIP_EXTRACT_HEIGHT, FILMSTRIP_EXTRACT_WIDTH, } from '@/features/timeline/constants'; -import type { FilmstripFrame } from './filmstrip-opfs-storage'; +import type { FilmstripFrame } from './filmstrip-storage'; const FRAME_MEMORY_FALLBACK_BYTES = FILMSTRIP_EXTRACT_WIDTH * FILMSTRIP_EXTRACT_HEIGHT * 4; diff --git a/src/features/timeline/services/filmstrip-opfs-storage.ts b/src/features/timeline/services/filmstrip-opfs-storage.ts deleted file mode 100644 index f1698f910..000000000 --- a/src/features/timeline/services/filmstrip-opfs-storage.ts +++ /dev/null @@ -1,623 +0,0 @@ -/** - * OPFS Filmstrip Storage - * - * Simple storage for filmstrip frames. Worker handles saving, - * this service handles loading and providing object URLs. - * - * Storage structure: - * filmstrips/{mediaId}/ - * meta.json - { width, height, isComplete, frameCount } - * 0.jpg, 1.jpg, 2.jpg, ... 
(legacy caches may still use .webp) - */ - -import { createLogger } from '@/shared/logging/logger'; -import { getCacheMigration } from '@/infrastructure/storage/cache-version'; -import { - mirrorBlobToWorkspace, - mirrorJsonToWorkspace, - readWorkspaceBlob, - removeWorkspaceCacheEntry, -} from '@/infrastructure/storage/workspace-fs/cache-mirror'; -import { - filmstripFileFramePath, - filmstripMetaPath, - WORKSPACE_FILMSTRIPS_DIR, -} from '@/infrastructure/storage/workspace-fs/paths'; -import { safeWrite } from '../utils/opfs-safe-write'; - -const logger = createLogger('FilmstripOPFS'); - -const FILMSTRIP_DIR = 'filmstrips'; -const FRAME_RATE = 1; // Must match worker - 1fps for filmstrip thumbnails -const PRIMARY_FRAME_EXT = 'jpg'; -const LEGACY_FRAME_EXT = 'webp'; -const FRAME_EXTENSIONS = new Set([PRIMARY_FRAME_EXT, LEGACY_FRAME_EXT]); -const VALIDATION_TTL_MS = 10_000; - -function parseFrameFileNameParts(name: string): { index: number; ext: string } | null { - const dotIndex = name.lastIndexOf('.'); - if (dotIndex <= 0) return null; - const ext = name.slice(dotIndex + 1).toLowerCase(); - if (!FRAME_EXTENSIONS.has(ext)) return null; - const index = parseInt(name.slice(0, dotIndex), 10); - if (Number.isNaN(index)) return null; - return { index, ext }; -} - -function parseFrameFileName(name: string): number | null { - const parsed = parseFrameFileNameParts(name); - return parsed?.index ?? null; -} - -interface FilmstripMetadata { - width: number; - height: number; - isComplete: boolean; - frameCount: number; -} - -export interface FilmstripFrame { - index: number; - timestamp: number; - url: string; // Object URL for img src - byteSize?: number; - /** Hardware-backed bitmap for instant canvas rendering (skips JPEG decode) */ - bitmap?: ImageBitmap; -} - -interface LoadedFilmstrip { - metadata: FilmstripMetadata; - frames: FilmstripFrame[]; - existingIndices: number[]; -} - -interface MediaDirCacheEntry { - handle: FileSystemDirectoryHandle; - lastValidated: number; -} - -/** - * OPFS Filmstrip Storage Service - */ -class FilmstripOPFSStorage { - private dirHandle: FileSystemDirectoryHandle | null = null; - private initPromise: Promise | null = null; - private objectUrls = new Map>(); // mediaId -> frameIndex -> url - private mediaDirCache = new Map(); - - private scheduleRevoke(urls: string[]): void { - if (urls.length === 0) return; - - const revoke = () => { - for (const url of urls) { - URL.revokeObjectURL(url); - } - }; - - if (typeof requestIdleCallback === 'function') { - requestIdleCallback(revoke, { timeout: 10_000 }); - return; - } - - setTimeout(revoke, 0); - } - - private setFrameUrl(mediaId: string, index: number, url: string): void { - const urlsByIndex = this.objectUrls.get(mediaId) ?? 
new Map(); - const previous = urlsByIndex.get(index); - urlsByIndex.set(index, url); - this.objectUrls.set(mediaId, urlsByIndex); - - if (previous && previous !== url) { - this.scheduleRevoke([previous]); - } - } - - private replaceAllFrameUrls( - mediaId: string, - entries: Array<{ index: number; url: string }> - ): void { - const previous = this.objectUrls.get(mediaId); - const next = new Map(); - for (const entry of entries) { - next.set(entry.index, entry.url); - } - this.objectUrls.set(mediaId, next); - - if (!previous) return; - - const toRevoke: string[] = []; - for (const [index, url] of previous) { - const nextUrl = next.get(index); - if (nextUrl !== url) { - toRevoke.push(url); - } - } - this.scheduleRevoke(toRevoke); - } - - /** - * Initialize OPFS directory - */ - private async ensureDirectory(): Promise { - if (this.dirHandle) return this.dirHandle; - if (this.initPromise) return this.initPromise; - - this.initPromise = this.initialize(); - return this.initPromise; - } - - private async initialize(): Promise { - try { - const root = await navigator.storage.getDirectory(); - const dir = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true }); - - // Run migration if needed - const migration = getCacheMigration('filmstrip'); - if (migration.needsMigration) { - const entries: string[] = []; - for await (const entry of dir.values()) { - entries.push(entry.name); - } - for (const name of entries) { - await dir.removeEntry(name, { recursive: true }).catch(() => {}); - } - migration.markComplete(); - logger.info(`Filmstrip cache cleared for v${migration.newVersion}`); - } - - this.dirHandle = dir; - return dir; - } catch (error) { - logger.error('Failed to initialize OPFS:', error); - throw error; - } - } - - /** - * Get media directory handle - */ - private async getMediaDir(mediaId: string): Promise { - const cached = this.mediaDirCache.get(mediaId); - const dir = await this.ensureDirectory(); - - if (cached) { - if (Date.now() - cached.lastValidated <= VALIDATION_TTL_MS) { - return cached.handle; - } - - try { - // Probe the cached handle. If the underlying directory was removed, - // OPFS access will throw and we'll invalidate + recover below. 
- const iterator = cached.handle.values(); - await iterator.next(); - this.mediaDirCache.set(mediaId, { - handle: cached.handle, - lastValidated: Date.now(), - }); - return cached.handle; - } catch { - this.mediaDirCache.delete(mediaId); - try { - const reopened = await dir.getDirectoryHandle(mediaId); - this.mediaDirCache.set(mediaId, { - handle: reopened, - lastValidated: Date.now(), - }); - return reopened; - } catch { - return null; - } - } - } - - try { - const mediaDir = await dir.getDirectoryHandle(mediaId); - this.mediaDirCache.set(mediaId, { - handle: mediaDir, - lastValidated: Date.now(), - }); - return mediaDir; - } catch { - return null; - } - } - - /** - * Get or create media directory handle - */ - private async getOrCreateMediaDir(mediaId: string): Promise { - const cached = this.mediaDirCache.get(mediaId); - if (cached && Date.now() - cached.lastValidated <= VALIDATION_TTL_MS) { - return cached.handle; - } - - const dir = await this.ensureDirectory(); - const mediaDir = await dir.getDirectoryHandle(mediaId, { create: true }); - this.mediaDirCache.set(mediaId, { - handle: mediaDir, - lastValidated: Date.now(), - }); - return mediaDir; - } - - /** - * Save metadata file (used by worker and fallback extraction) - */ - async saveMetadata( - mediaId: string, - metadata: { width: number; height: number; isComplete: boolean; frameCount: number } - ): Promise { - const mediaDir = await this.getOrCreateMediaDir(mediaId); - const fileHandle = await mediaDir.getFileHandle('meta.json', { create: true }); - const writable = await fileHandle.createWritable(); - await safeWrite(writable, JSON.stringify(metadata)); - void mirrorJsonToWorkspace(filmstripMetaPath(mediaId), metadata); - } - - /** - * Save a frame blob at a specific index - */ - async saveFrameBlob(mediaId: string, index: number, blob: Blob): Promise { - const mediaDir = await this.getOrCreateMediaDir(mediaId); - const fileHandle = await mediaDir.getFileHandle(`${index}.${PRIMARY_FRAME_EXT}`, { create: true }); - const writable = await fileHandle.createWritable(); - await safeWrite(writable, blob); - void mirrorBlobToWorkspace( - filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT), - blob, - ); - } - - /** - * Load filmstrip - returns object URLs for img src - */ - async load(mediaId: string): Promise { - try { - let mediaDir = await this.getMediaDir(mediaId); - - // If OPFS has nothing for this media, try hydrating from the workspace - // folder — another origin may have produced the filmstrip. hydration - // writes the meta.json + primary-ext frames back into OPFS so the - // normal load path can pick it up. - if (!mediaDir) { - const hydrated = await this.hydrateFromWorkspace(mediaId); - if (!hydrated) return null; - mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return null; - } - - // Load metadata - let metadata: FilmstripMetadata; - try { - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - metadata = JSON.parse(await metaFile.text()); - } catch { - const hydrated = await this.hydrateFromWorkspace(mediaId); - if (!hydrated) return null; - try { - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - metadata = JSON.parse(await metaFile.text()); - } catch { - return null; - } - } - - // Collect frame files (dedupe by frame index, prefer primary extension). 
- const frameFilesByIndex = new Map(); - for await (const entry of mediaDir.values()) { - if (entry.kind !== 'file') continue; - const parsed = parseFrameFileNameParts(entry.name); - if (!parsed) continue; - try { - const fileHandle = entry as FileSystemFileHandle; - const file = await fileHandle.getFile(); - if (file.size <= 0) continue; - - const existing = frameFilesByIndex.get(parsed.index); - const shouldReplace = !existing - || (parsed.ext === PRIMARY_FRAME_EXT && existing.ext !== PRIMARY_FRAME_EXT); - if (shouldReplace) { - frameFilesByIndex.set(parsed.index, { file, ext: parsed.ext }); - } - } catch { - // Skip unreadable files - } - } - - const frameFiles = Array.from(frameFilesByIndex.entries()) - .map(([index, value]) => ({ index, file: value.file })) - .sort((a, b) => a.index - b.index); - - // Create object URLs - const nextUrls: Array<{ index: number; url: string }> = []; - const frames: FilmstripFrame[] = frameFiles.map(({ index, file }) => { - const url = URL.createObjectURL(file); - nextUrls.push({ index, url }); - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: file.size, - }; - }); - this.replaceAllFrameUrls(mediaId, nextUrls); - - const existingIndices = frameFiles.map(f => f.index); - - // Sanity check: if marked complete but no frames, treat as incomplete - if (metadata.isComplete && frames.length === 0) { - logger.warn(`Filmstrip ${mediaId} marked complete but has 0 frames - resetting`); - metadata.isComplete = false; - metadata.frameCount = 0; - } - - logger.debug(`Loaded filmstrip ${mediaId}: ${frames.length} frames, complete: ${metadata.isComplete}`); - - return { metadata, frames, existingIndices }; - } catch (error) { - logger.warn('Failed to load filmstrip:', error); - return null; - } - } - - /** - * Get existing frame indices (for resume) - */ - async getExistingIndices( - mediaId: string, - startIndex?: number, - endIndex?: number - ): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return []; - - const indices = new Set(); - for await (const entry of mediaDir.values()) { - if (entry.kind !== 'file') continue; - const index = parseFrameFileName(entry.name); - if (index !== null) { - if (typeof startIndex === 'number' && index < startIndex) { - continue; - } - if (typeof endIndex === 'number' && index >= endIndex) { - continue; - } - try { - const fileHandle = entry as FileSystemFileHandle; - const file = await fileHandle.getFile(); - if (file.size > 0) { - indices.add(index); - } - } catch { - // Skip - } - } - } - - return Array.from(indices).sort((a, b) => a - b); - } catch { - return []; - } - } - - /** - * Load a single frame by index - for incremental updates during extraction - */ - async loadSingleFrame(mediaId: string, index: number): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return null; - - let file: File | null = null; - try { - const primaryHandle = await mediaDir.getFileHandle(`${index}.${PRIMARY_FRAME_EXT}`); - file = await primaryHandle.getFile(); - } catch { - try { - const legacyHandle = await mediaDir.getFileHandle(`${index}.${LEGACY_FRAME_EXT}`); - file = await legacyHandle.getFile(); - } catch { - return null; - } - } - if (!file || file.size === 0) return null; - - const url = URL.createObjectURL(file); - this.setFrameUrl(mediaId, index, url); - - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: file.size, - }; - } catch { - return null; - } - } - - /** - * Create an in-memory frame URL from a worker-provided 
blob. - * Used for progressive UI updates to avoid immediate OPFS read-after-write. - */ - createFrameFromBlob(mediaId: string, index: number, blob: Blob): FilmstripFrame | null { - if (!blob || blob.size === 0) { - return null; - } - - const url = URL.createObjectURL(blob); - this.setFrameUrl(mediaId, index, url); - - return { - index, - timestamp: index / FRAME_RATE, - url, - byteSize: blob.size, - }; - } - - /** - * Create an in-memory frame from a transferred ImageBitmap. - * Provides instant display without JPEG encode/decode roundtrip. - * URL is empty — the component renders from bitmap directly via canvas. - * Once the JPEG blob arrives (via createFrameFromBlob), the URL is set - * and the bitmap can be closed. - */ - createFrameFromBitmap(_mediaId: string, index: number, bitmap: ImageBitmap): FilmstripFrame | null { - if (!bitmap || bitmap.width === 0) return null; - - return { - index, - timestamp: index / FRAME_RATE, - url: '', - byteSize: bitmap.width * bitmap.height * 4, - bitmap, - }; - } - - /** - * Check if filmstrip is complete - */ - async isComplete(mediaId: string): Promise { - try { - const mediaDir = await this.getMediaDir(mediaId); - if (!mediaDir) return false; - - const metaHandle = await mediaDir.getFileHandle('meta.json'); - const metaFile = await metaHandle.getFile(); - const metadata: FilmstripMetadata = JSON.parse(await metaFile.text()); - return metadata.isComplete; - } catch { - return false; - } - } - - /** - * Delete filmstrip - */ - async delete(mediaId: string): Promise { - this.revokeUrls(mediaId); - this.mediaDirCache.delete(mediaId); - try { - const dir = await this.ensureDirectory(); - await dir.removeEntry(mediaId, { recursive: true }); - logger.debug(`Deleted filmstrip ${mediaId}`); - } catch { - // May not exist - } - void removeWorkspaceCacheEntry([WORKSPACE_FILMSTRIPS_DIR, mediaId], { - recursive: true, - }); - } - - /** - * Pull filmstrip meta + frames from the workspace folder into OPFS. - * Used as a cross-origin fallback when OPFS has no cache for this media. - * Returns true when at least the metadata was recovered. - */ - private async hydrateFromWorkspace(mediaId: string): Promise { - try { - const metaBlob = await readWorkspaceBlob(filmstripMetaPath(mediaId)); - if (!metaBlob) return false; - - let metadata: FilmstripMetadata; - try { - metadata = JSON.parse(await metaBlob.text()) as FilmstripMetadata; - } catch { - return false; - } - - const mediaDir = await this.getOrCreateMediaDir(mediaId); - - const metaHandle = await mediaDir.getFileHandle('meta.json', { create: true }); - const metaWritable = await metaHandle.createWritable(); - await safeWrite(metaWritable, JSON.stringify(metadata)); - - // Frame count is the declared upper bound; missing frames are skipped. 
- const expected = Math.max(0, metadata.frameCount | 0); - for (let index = 0; index < expected; index += 1) { - const frameBlob = await readWorkspaceBlob( - filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT), - ); - if (!frameBlob || frameBlob.size === 0) continue; - const frameHandle = await mediaDir.getFileHandle( - `${index}.${PRIMARY_FRAME_EXT}`, - { create: true }, - ); - const frameWritable = await frameHandle.createWritable(); - await safeWrite(frameWritable, frameBlob); - } - - logger.debug(`Hydrated filmstrip ${mediaId} from workspace`); - return true; - } catch (error) { - logger.warn(`hydrateFromWorkspace(${mediaId}) failed`, error); - return false; - } - } - - /** - * Revoke object URLs for a media - */ - revokeUrls(mediaId: string): void { - const urlsByIndex = this.objectUrls.get(mediaId); - if (urlsByIndex) { - for (const url of urlsByIndex.values()) { - URL.revokeObjectURL(url); - } - this.objectUrls.delete(mediaId); - } - } - - /** - * Clear all filmstrips - */ - async clearAll(): Promise { - // Revoke all URLs - for (const mediaId of this.objectUrls.keys()) { - this.revokeUrls(mediaId); - } - this.mediaDirCache.clear(); - - try { - const dir = await this.ensureDirectory(); - const entries: string[] = []; - for await (const entry of dir.values()) { - entries.push(entry.name); - } - for (const name of entries) { - await dir.removeEntry(name, { recursive: true }); - } - logger.debug(`Cleared ${entries.length} filmstrips`); - } catch (error) { - logger.error('Failed to clear filmstrips:', error); - } - void removeWorkspaceCacheEntry([WORKSPACE_FILMSTRIPS_DIR], { recursive: true }); - } - - /** - * List all stored filmstrips - */ - async list(): Promise { - try { - const dir = await this.ensureDirectory(); - const ids: string[] = []; - for await (const entry of dir.values()) { - if (entry.kind === 'directory') { - ids.push(entry.name); - } - } - return ids; - } catch { - return []; - } - } -} - -// Singleton -export const filmstripOPFSStorage = new FilmstripOPFSStorage(); - diff --git a/src/features/timeline/services/filmstrip-storage.ts b/src/features/timeline/services/filmstrip-storage.ts new file mode 100644 index 000000000..c05472043 --- /dev/null +++ b/src/features/timeline/services/filmstrip-storage.ts @@ -0,0 +1,446 @@ +/** + * Filmstrip Storage + * + * Filmstrip frames are now persisted in the selected workspace folder: + * filmstrips/{mediaId}/ + * meta.json - { width, height, isComplete, frameCount } + * 0.jpg, 1.jpg, 2.jpg, ... (legacy caches may still use .webp) + * + * Legacy OPFS filmstrips are read only as a fallback. When encountered, + * they are hydrated into the workspace so subsequent reads stay unified. 
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+import { getCacheMigration } from '@/infrastructure/storage/cache-version';
+import {
+  readBlob,
+  readJson,
+  writeBlob,
+  writeJsonAtomic,
+  removeEntry,
+  listDirectory,
+} from '@/infrastructure/storage/workspace-fs/fs-primitives';
+import { requireWorkspaceRoot } from '@/infrastructure/storage/workspace-fs/root';
+import {
+  filmstripFileFramePath,
+  filmstripMetaPath,
+  WORKSPACE_FILMSTRIPS_DIR,
+} from '@/infrastructure/storage/workspace-fs/paths';
+
+const logger = createLogger('FilmstripStorage');
+
+const FILMSTRIP_DIR = 'filmstrips';
+const FRAME_RATE = 1; // Must match worker - 1fps for filmstrip thumbnails
+const PRIMARY_FRAME_EXT = 'jpg';
+const LEGACY_FRAME_EXT = 'webp';
+const FRAME_EXTENSIONS = new Set([PRIMARY_FRAME_EXT, LEGACY_FRAME_EXT]);
+
+function parseFrameFileNameParts(name: string): { index: number; ext: string } | null {
+  const dotIndex = name.lastIndexOf('.');
+  if (dotIndex <= 0) return null;
+  const ext = name.slice(dotIndex + 1).toLowerCase();
+  if (!FRAME_EXTENSIONS.has(ext)) return null;
+  const index = parseInt(name.slice(0, dotIndex), 10);
+  if (Number.isNaN(index)) return null;
+  return { index, ext };
+}
+
+function parseFrameFileName(name: string): number | null {
+  return parseFrameFileNameParts(name)?.index ?? null;
+}
+
+interface FilmstripMetadata {
+  width: number;
+  height: number;
+  isComplete: boolean;
+  frameCount: number;
+}
+
+export interface FilmstripFrame {
+  index: number;
+  timestamp: number;
+  url: string;
+  byteSize?: number;
+  bitmap?: ImageBitmap;
+}
+
+interface LoadedFilmstrip {
+  metadata: FilmstripMetadata;
+  frames: FilmstripFrame[];
+  existingIndices: number[];
+}
+
+class FilmstripStorage {
+  private objectUrls = new Map<string, Map<number, string>>();
+  private legacyInitPromise: Promise<FileSystemDirectoryHandle | null> | null = null;
+
+  private scheduleRevoke(urls: string[]): void {
+    if (urls.length === 0) return;
+
+    const revoke = () => {
+      for (const url of urls) {
+        URL.revokeObjectURL(url);
+      }
+    };
+
+    if (typeof requestIdleCallback === 'function') {
+      requestIdleCallback(revoke, { timeout: 10_000 });
+      return;
+    }
+
+    setTimeout(revoke, 0);
+  }
+
+  private setFrameUrl(mediaId: string, index: number, url: string): void {
+    const urlsByIndex = this.objectUrls.get(mediaId) ?? new Map<number, string>();
+    const previous = urlsByIndex.get(index);
+    urlsByIndex.set(index, url);
+    this.objectUrls.set(mediaId, urlsByIndex);
+
+    if (previous && previous !== url) {
+      this.scheduleRevoke([previous]);
+    }
+  }
+
+  private replaceAllFrameUrls(
+    mediaId: string,
+    entries: Array<{ index: number; url: string }>,
+  ): void {
+    const previous = this.objectUrls.get(mediaId);
+    const next = new Map<number, string>();
+    for (const entry of entries) {
+      next.set(entry.index, entry.url);
+    }
+    this.objectUrls.set(mediaId, next);
+
+    if (!previous) return;
+
+    const toRevoke: string[] = [];
+    for (const [index, url] of previous) {
+      const nextUrl = next.get(index);
+      if (nextUrl !== url) {
+        toRevoke.push(url);
+      }
+    }
+    this.scheduleRevoke(toRevoke);
+  }
+
+  private async readMetadata(
+    mediaId: string,
+  ): Promise<FilmstripMetadata | null> {
+    return await readJson(requireWorkspaceRoot(), filmstripMetaPath(mediaId));
+  }
+
+  private async ensureWorkspaceFilmstrip(
+    mediaId: string,
+  ): Promise<FilmstripMetadata | null> {
+    const existing = await this.readMetadata(mediaId);
+    if (existing) return existing;
+
+    const hydrated = await this.hydrateFromLegacyOpfs(mediaId);
+    if (!hydrated) return null;
+    return await this.readMetadata(mediaId);
+  }
+
+  private async getLegacyFilmstripRoot(): Promise<FileSystemDirectoryHandle | null> {
+    if (this.legacyInitPromise) return this.legacyInitPromise;
+
+    this.legacyInitPromise = (async () => {
+      try {
+        const root = await navigator.storage.getDirectory();
+        const dir = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true });
+
+        const migration = getCacheMigration('filmstrip');
+        if (migration.needsMigration) {
+          const entries: string[] = [];
+          for await (const entry of dir.values()) {
+            entries.push(entry.name);
+          }
+          for (const name of entries) {
+            await dir.removeEntry(name, { recursive: true }).catch(() => undefined);
+          }
+          migration.markComplete();
+          logger.info(`Legacy filmstrip cache cleared for v${migration.newVersion}`);
+        }
+
+        return dir;
+      } catch (error) {
+        logger.warn('Failed to access legacy OPFS filmstrip root', error);
+        return null;
+      }
+    })();
+
+    return this.legacyInitPromise;
+  }
+
+  private async getLegacyMediaDir(mediaId: string): Promise<FileSystemDirectoryHandle | null> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return null;
+      return await root.getDirectoryHandle(mediaId);
+    } catch {
+      return null;
+    }
+  }
+
+  private async deleteLegacyFilmstrip(mediaId: string): Promise<void> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return;
+      await root.removeEntry(mediaId, { recursive: true });
+    } catch {
+      // ignore missing legacy cache
+    }
+  }
+
+  private async clearLegacyFilmstrips(): Promise<void> {
+    try {
+      const root = await this.getLegacyFilmstripRoot();
+      if (!root) return;
+      const entries: string[] = [];
+      for await (const entry of root.values()) {
+        entries.push(entry.name);
+      }
+      for (const name of entries) {
+        await root.removeEntry(name, { recursive: true }).catch(() => undefined);
+      }
+    } catch (error) {
+      logger.warn('Failed to clear legacy OPFS filmstrips', error);
+    }
+  }
+
+  private async hydrateFromLegacyOpfs(mediaId: string): Promise<boolean> {
+    try {
+      const mediaDir = await this.getLegacyMediaDir(mediaId);
+      if (!mediaDir) return false;
+
+      const metaHandle = await mediaDir.getFileHandle('meta.json');
+      const metaFile = await metaHandle.getFile();
+      const metadata = JSON.parse(await metaFile.text()) as FilmstripMetadata;
+      await writeJsonAtomic(requireWorkspaceRoot(), filmstripMetaPath(mediaId), metadata);
+
+      for await (const entry of mediaDir.values()) {
+        if (entry.kind !== 'file') continue;
+        const parsed = parseFrameFileNameParts(entry.name);
+        if (!parsed) continue;
+        const file = await (entry as FileSystemFileHandle).getFile();
+        if (file.size <= 0) continue;
+        await writeBlob(
+          requireWorkspaceRoot(),
+          filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+          file,
+        );
+      }
+
+      logger.debug(`Hydrated filmstrip ${mediaId} from legacy OPFS`);
+      return true;
+    } catch (error) {
+      logger.warn(`hydrateFromLegacyOpfs(${mediaId}) failed`, error);
+      return false;
+    }
+  }
+
+  async saveMetadata(
+    mediaId: string,
+    metadata: { width: number; height: number; isComplete: boolean; frameCount: number },
+  ): Promise<void> {
+    await writeJsonAtomic(requireWorkspaceRoot(), filmstripMetaPath(mediaId), metadata);
+  }
+
+  async saveFrameBlob(mediaId: string, index: number, blob: Blob): Promise<void> {
+    await writeBlob(
+      requireWorkspaceRoot(),
+      filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT),
+      blob,
+    );
+  }
+
+  async load(mediaId: string): Promise<LoadedFilmstrip | null> {
+    try {
+      const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+      if (!metadata) return null;
+
+      const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId]);
+      const frameFilesByIndex = new Map<number, { blob: Blob; ext: string }>();
+
+      for (const entry of entries) {
+        if (entry.kind !== 'file') continue;
+        const parsed = parseFrameFileNameParts(entry.name);
+        if (!parsed) continue;
+
+        const blob = await readBlob(
+          requireWorkspaceRoot(),
+          filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+        );
+        if (!blob || blob.size <= 0) continue;
+
+        const existing = frameFilesByIndex.get(parsed.index);
+        const shouldReplace = !existing
+          || (parsed.ext === PRIMARY_FRAME_EXT && existing.ext !== PRIMARY_FRAME_EXT);
+        if (shouldReplace) {
+          frameFilesByIndex.set(parsed.index, { blob, ext: parsed.ext });
+        }
+      }
+
+      const frameFiles = Array.from(frameFilesByIndex.entries())
+        .map(([index, value]) => ({ index, blob: value.blob }))
+        .sort((a, b) => a.index - b.index);
+
+      const nextUrls: Array<{ index: number; url: string }> = [];
+      const frames: FilmstripFrame[] = frameFiles.map(({ index, blob }) => {
+        const url = URL.createObjectURL(blob);
+        nextUrls.push({ index, url });
+        return {
+          index,
+          timestamp: index / FRAME_RATE,
+          url,
+          byteSize: blob.size,
+        };
+      });
+      this.replaceAllFrameUrls(mediaId, nextUrls);
+
+      const existingIndices = frameFiles.map((frame) => frame.index);
+
+      if (metadata.isComplete && frames.length === 0) {
+        logger.warn(`Filmstrip ${mediaId} marked complete but has 0 frames - resetting`);
+        metadata.isComplete = false;
+        metadata.frameCount = 0;
+      }
+
+      logger.debug(`Loaded filmstrip ${mediaId}: ${frames.length} frames, complete: ${metadata.isComplete}`);
+      return { metadata, frames, existingIndices };
+    } catch (error) {
+      logger.warn('Failed to load filmstrip:', error);
+      return null;
+    }
+  }
+
+  async getExistingIndices(
+    mediaId: string,
+    startIndex?: number,
+    endIndex?: number,
+  ): Promise<number[]> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    if (!metadata) return [];
+
+    const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId]);
+    const indices = new Set<number>();
+
+    for (const entry of entries) {
+      if (entry.kind !== 'file') continue;
+      const index = parseFrameFileName(entry.name);
+      if (index === null) continue;
+      if (typeof startIndex === 'number' && index < startIndex) continue;
+      if (typeof endIndex === 'number' && index >= endIndex) continue;
+
+      const parsed = parseFrameFileNameParts(entry.name);
+      if (!parsed) continue;
+      const blob = await readBlob(
+        requireWorkspaceRoot(),
+        filmstripFileFramePath(mediaId, parsed.index, parsed.ext),
+      );
+      if (blob && blob.size > 0) {
+        indices.add(index);
+      }
+    }
+
+    return Array.from(indices).sort((a, b) => a - b);
+  }
+
+  async loadSingleFrame(mediaId: string, index: number): Promise<FilmstripFrame | null> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    if (!metadata) return null;
+
+    let blob = await readBlob(
+      requireWorkspaceRoot(),
+      filmstripFileFramePath(mediaId, index, PRIMARY_FRAME_EXT),
+    );
+    if (!blob || blob.size === 0) {
+      blob = await readBlob(
+        requireWorkspaceRoot(),
+        filmstripFileFramePath(mediaId, index, LEGACY_FRAME_EXT),
+      );
+    }
+    if (!blob || blob.size === 0) return null;
+
+    const url = URL.createObjectURL(blob);
+    this.setFrameUrl(mediaId, index, url);
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url,
+      byteSize: blob.size,
+    };
+  }
+
+  createFrameFromBlob(mediaId: string, index: number, blob: Blob): FilmstripFrame | null {
+    if (!blob || blob.size === 0) return null;
+
+    const url = URL.createObjectURL(blob);
+    this.setFrameUrl(mediaId, index, url);
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url,
+      byteSize: blob.size,
+    };
+  }
+
+  createFrameFromBitmap(_mediaId: string, index: number, bitmap: ImageBitmap): FilmstripFrame | null {
+    if (!bitmap || bitmap.width === 0) return null;
+
+    return {
+      index,
+      timestamp: index / FRAME_RATE,
+      url: '',
+      byteSize: bitmap.width * bitmap.height * 4,
+      bitmap,
+    };
+  }
+
+  async isComplete(mediaId: string): Promise<boolean> {
+    const metadata = await this.ensureWorkspaceFilmstrip(mediaId);
+    return metadata?.isComplete ?? false;
+  }
+
+  async delete(mediaId: string): Promise<void> {
+    this.revokeUrls(mediaId);
+    await removeEntry(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR, mediaId], {
+      recursive: true,
+    });
+    await this.deleteLegacyFilmstrip(mediaId);
+    logger.debug(`Deleted filmstrip ${mediaId}`);
+  }
+
+  revokeUrls(mediaId: string): void {
+    const urlsByIndex = this.objectUrls.get(mediaId);
+    if (!urlsByIndex) return;
+
+    for (const url of urlsByIndex.values()) {
+      URL.revokeObjectURL(url);
+    }
+    this.objectUrls.delete(mediaId);
+  }
+
+  async clearAll(): Promise<void> {
+    for (const mediaId of this.objectUrls.keys()) {
+      this.revokeUrls(mediaId);
+    }
+
+    await removeEntry(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR], {
+      recursive: true,
+    }).catch(() => undefined);
+    await this.clearLegacyFilmstrips();
+  }
+
+  async list(): Promise<string[]> {
+    const entries = await listDirectory(requireWorkspaceRoot(), [WORKSPACE_FILMSTRIPS_DIR]);
+    return entries
+      .filter((entry) => entry.kind === 'directory')
+      .map((entry) => entry.name);
+  }
+}
+
+export const filmstripStorage = new FilmstripStorage();
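To orient readers of the new service, here is a minimal consumer sketch. `renderFilmstrip` and `disposeFilmstrip` are hypothetical names for illustration; only the `filmstripStorage` calls and the `LoadedFilmstrip` shape come from the file above, and a workspace root must already have been granted (otherwise `requireWorkspaceRoot()` throws inside the service):

```ts
import { filmstripStorage } from '@/features/timeline/services/filmstrip-storage';

// Hypothetical consumer: render whatever is already persisted, then backfill
// any gaps as the extraction worker persists more frames.
async function renderFilmstrip(mediaId: string, container: HTMLElement): Promise<void> {
  const loaded = await filmstripStorage.load(mediaId);
  if (!loaded) return; // nothing persisted yet for this media

  for (const frame of loaded.frames) {
    const img = document.createElement('img');
    img.src = frame.url; // object URL owned (and later revoked) by the service
    container.appendChild(img);
  }

  // Indices below frameCount that are not on disk yet can be retried
  // individually via loadSingleFrame.
  const present = new Set(loaded.existingIndices);
  for (let index = 0; index < loaded.metadata.frameCount; index += 1) {
    if (present.has(index)) continue;
    const late = await filmstripStorage.loadSingleFrame(mediaId, index);
    if (late) {
      const img = document.createElement('img');
      img.src = late.url;
      container.appendChild(img);
    }
  }
}

// Release the object URLs once the strip leaves the viewport.
function disposeFilmstrip(mediaId: string): void {
  filmstripStorage.revokeUrls(mediaId);
}
```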
diff --git a/src/features/timeline/stores/actions/item-actions.linked-items.test.ts b/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
index a6b3530f4..e570d537a 100644
--- a/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
+++ b/src/features/timeline/stores/actions/item-actions.linked-items.test.ts
@@ -1,5 +1,5 @@
 import { beforeEach, describe, expect, it } from 'vitest';
-import type { AudioItem, TimelineTrack, VideoItem } from '@/types/timeline';
+import type { AudioItem, TextItem, TimelineTrack, VideoItem } from '@/types/timeline';
 import { useItemsStore } from '../items-store';
 import { useTransitionsStore } from '../transitions-store';
 import { useKeyframesStore } from '../keyframes-store';
@@ -51,6 +51,20 @@ function makeAudioItem(overrides: Partial<AudioItem> = {}): AudioItem {
   };
 }
 
+function makeTextItem(overrides: Partial<TextItem> = {}): TextItem {
+  return {
+    id: 'text-1',
+    type: 'text',
+    trackId: 'caption-track',
+    from: 0,
+    durationInFrames: 60,
+    text: 'Caption',
+    style: {},
+    textRole: 'caption',
+    ...overrides,
+  };
+}
+
 function makeTrack(overrides: Partial<TimelineTrack> & Pick<TimelineTrack, 'id' | 'name' | 'order' | 'kind'>): TimelineTrack {
   return {
     height: 80,
@@ -599,6 +613,65 @@ describe('linked timeline items', () => {
     expect(items.find((item) => item.id === 'solo-audio')).toBeUndefined();
   });
 
+  it('ripple delete shifts attached captions with their surviving clip', () => {
+    useItemsStore.getState().setTracks([
+      makeTrack({ id: 'video-track', name: 'V1', order: 0, kind: 'video' }),
+      makeTrack({ id: 'audio-track', name: 'A1', order: 1, kind: 'audio' }),
+      makeTrack({ id: 'video-track-2', name: 'V2', order: 2, kind: 'video', syncLock: false }),
+    ]);
+    useItemsStore.getState().setItems([
+      makeVideoItem({
+        id: 'video-delete',
+        durationInFrames: 100,
+        linkedGroupId: 'group-del',
+        originId: 'origin-del',
+        mediaId: 'media-del',
+      }),
+      makeAudioItem({
+        id: 'audio-delete',
+        durationInFrames: 100,
+        linkedGroupId: 'group-del',
+        originId: 'origin-del',
+        mediaId: 'media-del',
+      }),
+      makeVideoItem({
+        id: 'video-survivor',
+        from: 100,
+        durationInFrames: 60,
+        linkedGroupId: 'group-survivor',
+        originId: 'origin-survivor',
+        mediaId: 'media-survivor',
+      }),
+      makeAudioItem({
+        id: 'audio-survivor',
+        from: 100,
+        durationInFrames: 60,
+        linkedGroupId: 'group-survivor',
+        originId: 'origin-survivor',
+        mediaId: 'media-survivor',
+      }),
+      makeTextItem({
+        id: 'caption-survivor',
+        trackId: 'video-track-2',
+        from: 100,
+        durationInFrames: 60,
+        captionSource: {
+          type: 'transcript',
+          clipId: 'video-survivor',
+          mediaId: 'media-survivor',
+        },
+      }),
+    ]);
+
+    rippleDeleteItems(['video-delete']);
+
+    const state = useItemsStore.getState();
+    expect(state.items.find((item) => item.id === 'video-survivor')).toMatchObject({ from: 0 });
+    expect(state.items.find((item) => item.id === 'audio-survivor')).toMatchObject({ from: 0 });
+    expect(state.items.find((item) => item.id === 'caption-survivor')).toMatchObject({ from: 0 });
+    expect(state.maxItemEndFrame).toBe(60);
+  });
+
   it('links an arbitrary multi-selection with a fresh group id', () => {
     useItemsStore.getState().setItems([
       makeVideoItem({ linkedGroupId: 'video-1' }),
diff --git a/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts b/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
index f009d2fab..997e2008c 100644
--- a/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
+++ b/src/features/timeline/stores/actions/item-actions.slip-slide.test.ts
@@ -2,6 +2,8 @@ import { beforeEach, describe, expect, it } from 'vitest';
 import type { VideoItem, TextItem } from '@/types/timeline';
 import { useItemsStore } from '../items-store';
 import { useTimelineSettingsStore } from '../timeline-settings-store';
+import { usePlaybackStore } from '@/shared/state/playback';
+import { usePreviewBridgeStore } from '@/shared/state/preview-bridge';
 import { slipItem, slideItem } from './item-actions';
 
 function makeVideoItem(overrides: Partial<VideoItem> = {}): VideoItem {
@@ -23,6 +25,20 @@ describe('slipItem', () => {
     useTimelineSettingsStore.setState({ fps: 30 });
     useItemsStore.getState().setItems([]);
     useItemsStore.getState().setTracks([]);
+    usePlaybackStore.setState({
+      currentFrame: 150,
+      currentFrameEpoch: 0,
+      previewFrame: null,
+      previewFrameEpoch: 0,
+      isPlaying: false,
+    });
+    usePreviewBridgeStore.setState({
displayedFrame: null, + captureFrame: null, + captureFrameImageData: null, + captureCanvasSource: null, + postEditWarmRequest: null, + }); }); it('shifts sourceStart and sourceEnd by slipDelta', () => { @@ -371,6 +387,51 @@ describe('slideItem', () => { expect(updatedMiddle.sourceEnd).toBe(updatedRight.sourceStart); }); + it('queues post-edit warm frames around the edited clip boundaries', () => { + const left = makeVideoItem({ + id: 'left', + trackId: 'track-1', + from: 0, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + }); + const middle = makeVideoItem({ + id: 'middle', + trackId: 'track-1', + from: 100, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + mediaId: 'media-2', + }); + const right = makeVideoItem({ + id: 'right', + trackId: 'track-1', + from: 200, + durationInFrames: 100, + sourceStart: 0, + sourceEnd: 100, + sourceDuration: 200, + sourceFps: 30, + mediaId: 'media-3', + }); + + useItemsStore.getState().setItems([left, middle, right]); + + slideItem('middle', 20, 'left', 'right'); + + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toMatchObject({ + frame: 150, + itemIds: ['middle', 'left', 'right'], + frames: expect.arrayContaining([150, 120, 121, 218, 219, 0, 1, 118, 119, 220, 221, 298, 299]), + }); + }); + it('keeps default slide semantics for non-split chains', () => { const left = makeVideoItem({ id: 'left', diff --git a/src/features/timeline/stores/actions/item-actions.ts b/src/features/timeline/stores/actions/item-actions.ts index 03d878255..6744b7e7e 100644 --- a/src/features/timeline/stores/actions/item-actions.ts +++ b/src/features/timeline/stores/actions/item-actions.ts @@ -22,6 +22,7 @@ import { expandSelectionWithLinkedItems, getLinkedItemIds, } from '../../utils/linked-items'; +import { isTrackSyncLockEnabled } from '../../utils/track-sync-lock'; import { placeItemsWithoutTimelineOverlap } from './item-placement'; function isLinkedSelectionEnabled(): boolean { @@ -167,8 +168,9 @@ export function rippleDeleteItems(ids: string[]): void { })); // Per-track: shift downstream items on the same track as each deleted item. - // Linked counterparts on other tracks shift via buildLinkedLeftShiftUpdates. - // Solo clips on unrelated tracks are left in place. + // Linked counterparts and attached captions on tracks that won't be handled + // by sync-lock ripple get shifted manually. Solo clips on unrelated tracks + // are left in place. for (const item of remainingItems) { const shiftAmount = items .filter((candidate) => idsToDelete.has(candidate.id)) @@ -180,8 +182,30 @@ export function rippleDeleteItems(ids: string[]): void { } } + const trackById = new Map(useItemsStore.getState().tracks.map((track) => [track.id, track])); + const itemById = new Map(remainingItems.map((item) => [item.id, item])); + const shiftByItemId = new Map(); + + for (const [itemId, shiftAmount] of baseShiftByItemId) { + if (shiftAmount <= 0) continue; + + const relatedIds = expandIdsWithLinkedItems(remainingItems, [itemId], linkedSelectionEnabled); + for (const relatedId of relatedIds) { + const relatedItem = itemById.get(relatedId); + if (!relatedItem) continue; + + const handledBySyncLock = !editedTrackIds.has(relatedItem.trackId) + && isTrackSyncLockEnabled(trackById.get(relatedItem.trackId)); + if (handledBySyncLock) { + continue; + } + + shiftByItemId.set(relatedId, Math.max(shiftByItemId.get(relatedId) ?? 
0, shiftAmount));
+    }
+  }
+
   const updates = remainingItems.flatMap((item) => {
-    const shiftAmount = baseShiftByItemId.get(item.id) ?? 0;
+    const shiftAmount = shiftByItemId.get(item.id) ?? 0;
     return shiftAmount > 0
       ? [{ id: item.id, from: item.from - shiftAmount }]
       : [];
diff --git a/src/features/timeline/stores/actions/item-edit-actions.ts b/src/features/timeline/stores/actions/item-edit-actions.ts
index 3f5e05f04..e33cf730f 100644
--- a/src/features/timeline/stores/actions/item-edit-actions.ts
+++ b/src/features/timeline/stores/actions/item-edit-actions.ts
@@ -15,6 +15,7 @@ import {
   mediaLibraryService,
   opfsService,
 } from '@/features/timeline/deps/media-library-service';
+import { writeMediaSource } from '@/infrastructure/storage/workspace-fs/media-source';
 import { toast } from 'sonner';
 import { execute, applyTransitionRepairs, getLogger } from './shared';
 import {
@@ -24,12 +25,18 @@ import {
   getSynchronizedLinkedItemsForEdit,
 } from './linked-edit';
 import { blobUrlManager } from '@/infrastructure/browser/blob-url-manager';
+import { usePlaybackStore } from '@/shared/state/playback';
+import { usePreviewBridgeStore } from '@/shared/state/preview-bridge';
 import { timelineToSourceFrames, sourceToTimelineFrames } from '../../utils/source-calculations';
 import { computeClampedSlipDelta } from '../../utils/slip-utils';
 import { computeSlideContinuitySourceDelta } from '../../utils/slide-utils';
 import { clampSlideDeltaToPreserveTransitions } from '../../utils/transition-utils';
 import { calculateTransitionPortions } from '@/core/timeline/transitions/transition-planner';
-import { getLinkedItemIds, getUniqueLinkedItemAnchorIds } from '../../utils/linked-items';
+import {
+  expandItemIdsWithAttachedCaptions,
+  getLinkedItemIds,
+  getUniqueLinkedItemAnchorIds,
+} from '../../utils/linked-items';
 import {
   propagateInsertedGapToSyncLockedTracks,
   propagateRemovedIntervalsToSyncLockedTracks,
@@ -40,6 +47,67 @@ function isLinkedSelectionEnabled(): boolean {
   return useEditorStore.getState().linkedSelectionEnabled;
 }
 
+const POST_EDIT_WARM_MAX_FRAMES = 32;
+
+function appendWarmFrame(target: number[], seen: Set<number>, frame: number): void {
+  if (!Number.isFinite(frame)) return;
+  const normalizedFrame = Math.max(0, Math.round(frame));
+  if (seen.has(normalizedFrame)) return;
+  seen.add(normalizedFrame);
+  target.push(normalizedFrame);
+}
+
+function appendItemWarmFrames(
+  target: number[],
+  seen: Set<number>,
+  item: TimelineItem | undefined,
+): void {
+  if (!item) return;
+  const startFrame = Math.max(0, Math.trunc(item.from));
+  const endFrame = Math.max(startFrame, Math.trunc(item.from + item.durationInFrames) - 1);
+  appendWarmFrame(target, seen, startFrame);
+  appendWarmFrame(target, seen, Math.min(endFrame, startFrame + 1));
+  appendWarmFrame(target, seen, Math.max(startFrame, endFrame - 1));
+  appendWarmFrame(target, seen, endFrame);
+}
+
+function collectPostEditWarmFrames(
+  itemIds: Iterable<string>,
+  preferredFrames: number[] = [],
+): number[] {
+  const frames: number[] = [];
+  const seen = new Set<number>();
+
+  for (const frame of preferredFrames) {
+    appendWarmFrame(frames, seen, frame);
+  }
+
+  const itemById = useItemsStore.getState().itemById;
+  for (const itemId of itemIds) {
+    appendItemWarmFrames(frames, seen, itemById[itemId]);
+    if (frames.length >= POST_EDIT_WARM_MAX_FRAMES) {
+      break;
+    }
+  }
+
+  return frames.slice(0, POST_EDIT_WARM_MAX_FRAMES);
+}
+
+function requestPostEditWarmForItems(
+  itemIds: Iterable<string>,
+  preferredFrames: number[] = [],
+): void {
+  const playbackState =
usePlaybackStore.getState(); + if (playbackState.isPlaying) return; + + const uniqueItemIds = Array.from(new Set(itemIds)); + if (uniqueItemIds.length === 0) return; + + const primaryFrame = playbackState.currentFrame; + const warmFrames = collectPostEditWarmFrames(uniqueItemIds, [primaryFrame, ...preferredFrames]); + usePreviewBridgeStore.getState().requestPostEditWarm(primaryFrame, uniqueItemIds, warmFrames); +} + function applySynchronizedTrim(id: string, handle: 'start' | 'end', trimAmount: number): void { const itemsStore = useItemsStore.getState(); const itemsBefore = itemsStore.items; @@ -70,7 +138,9 @@ function applySynchronizedTrim(id: string, handle: 'start' | 'end', trimAmount: } } - applyTransitionRepairs(synchronizedItems.map((item) => item.id)); + const affectedIds = synchronizedItems.map((item) => item.id); + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); } @@ -485,7 +555,7 @@ export function rateStretchItem( moveUpdates.push({ id: downstream.id, from: downstream.from + endDelta }); // Also move linked companions on other tracks - const linkedIds = getLinkedItemIds(freshItems, downstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, downstream.id)); for (const linkedId of linkedIds) { if (linkedId === downstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -506,7 +576,7 @@ export function rateStretchItem( if (neighbor) { movedIds.add(neighbor.id); moveUpdates.push({ id: neighbor.id, from: neighbor.from + endDelta }); - const linkedIds = getLinkedItemIds(freshItems, neighbor.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, neighbor.id)); for (const linkedId of linkedIds) { if (linkedId === neighbor.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -537,7 +607,7 @@ export function rateStretchItem( movedIds.add(upstream.id); moveUpdates.push({ id: upstream.id, from: Math.max(0, upstream.from + fromDelta) }); - const linkedIds = getLinkedItemIds(freshItems, upstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, upstream.id)); for (const linkedId of linkedIds) { if (linkedId === upstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -557,7 +627,7 @@ export function rateStretchItem( if (neighbor) { movedIds.add(neighbor.id); moveUpdates.push({ id: neighbor.id, from: Math.max(0, neighbor.from + fromDelta) }); - const linkedIds = getLinkedItemIds(freshItems, neighbor.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, neighbor.id)); for (const linkedId of linkedIds) { if (linkedId === neighbor.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -579,6 +649,7 @@ export function rateStretchItem( // Repair transitions for all affected clips const allAffectedIds = [...allSynchronizedIds, ...movedIds]; applyTransitionRepairs(allAffectedIds); + requestPostEditWarmForItems(allAffectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, newFrom, newDuration, newSpeed }); @@ -703,7 +774,7 @@ export function resetSpeedWithRipple(itemIds: string[]): void { moveUpdates.push({ id: downstream.id, from: downstream.from + growth }); // Also move linked companions on other tracks - const linkedIds 
= getLinkedItemIds(freshItems, downstream.id); + const linkedIds = expandItemIdsWithAttachedCaptions(freshItems, getLinkedItemIds(freshItems, downstream.id)); for (const linkedId of linkedIds) { if (linkedId === downstream.id || movedIds.has(linkedId)) continue; const linked = freshItems.find((i) => i.id === linkedId); @@ -723,6 +794,7 @@ export function resetSpeedWithRipple(itemIds: string[]): void { // Phase 3: Repair transitions for all affected clips const allAffectedIds = [...allChangedIds, ...movedIds]; applyTransitionRepairs(allAffectedIds); + requestPostEditWarmForItems(allAffectedIds); useTimelineSettingsStore.getState().markDirty(); }, { itemIds }); @@ -861,10 +933,14 @@ export async function insertFreezeFrame( updatedAt: Date.now(), }; - // Store the frame blob in OPFS + // Store the frame blob in OPFS, then mirror it into the workspace folder + // so other origins and external tooling can see it on disk. const opfsPath = `content/${frameMediaId.slice(0, 2)}/${frameMediaId.slice(2, 4)}/${frameMediaId}/data`; await opfsService.saveFile(opfsPath, await frameBlob.arrayBuffer()); mediaMetadata.opfsPath = opfsPath; + void writeMediaSource(frameMediaId, frameBlob, fileName).catch((error) => { + getLogger().warn('[insertFreezeFrame] Failed to mirror frame to workspace', error); + }); await createMedia(mediaMetadata); await associateMediaWithProject(currentProjectId, frameMediaId); @@ -1067,6 +1143,7 @@ export function rippleTrimItem(id: string, handle: 'start' | 'end', trimDelta: n useKeyframesStore.getState()._removeKeyframesForItems(lockedRemoved); } applyTransitionRepairs(affected, lockedRemoved.length > 0 ? new Set(lockedRemoved) : undefined); + requestPostEditWarmForItems(affected); useTimelineSettingsStore.getState().markDirty(); }, { id, handle, trimDelta }); } @@ -1122,9 +1199,11 @@ export function rollingTrimItems(leftId: string, rightId: string, editPointDelta } // Repair transitions for both clips - applyTransitionRepairs(counterpartPair + const affectedIds = counterpartPair ? 
[leftId, rightId, counterpartPair.leftCounterpart.id, counterpartPair.rightCounterpart.id] - : [leftId, rightId]); + : [leftId, rightId]; + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { leftId, rightId, editPointDelta }); @@ -1172,7 +1251,9 @@ export function slipItem(id: string, slipDelta: number): void { }); } - applyTransitionRepairs(synchronizedItems.map((synchronizedItem) => synchronizedItem.id)); + const affectedIds = synchronizedItems.map((synchronizedItem) => synchronizedItem.id); + applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, slipDelta }); @@ -1346,6 +1427,7 @@ export function slideItem( if (cpRightAdj) affectedIds.push(cpRightAdj.id); } applyTransitionRepairs(affectedIds); + requestPostEditWarmForItems(affectedIds); useTimelineSettingsStore.getState().markDirty(); }, { id, slideDelta, leftNeighborId, rightNeighborId }); diff --git a/src/features/timeline/stores/actions/linked-edit.test.ts b/src/features/timeline/stores/actions/linked-edit.test.ts index 0de97b5fa..7f7439c4f 100644 --- a/src/features/timeline/stores/actions/linked-edit.test.ts +++ b/src/features/timeline/stores/actions/linked-edit.test.ts @@ -62,6 +62,18 @@ describe('linked-edit helpers', () => { const items: TimelineItem[] = [ makeVideoItem({ id: 'video-2', from: 90, linkedGroupId: 'group-2' }), makeAudioItem({ id: 'audio-2', from: 90, linkedGroupId: 'group-2' }), + { + id: 'caption-2', + type: 'text', + trackId: 'caption-track', + from: 90, + durationInFrames: 60, + label: 'Caption', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-2', mediaId: 'media-1' }, + }, ]; expect( @@ -71,6 +83,7 @@ describe('linked-edit helpers', () => { ).toEqual([ { id: 'video-2', from: 60 }, { id: 'audio-2', from: 60 }, + { id: 'caption-2', from: 60 }, ]); }); @@ -118,4 +131,24 @@ describe('linked-edit helpers', () => { getMatchingSynchronizedLinkedCounterpartForEdit(items, 'video-left', 'audio-track', 'audio', true)?.id ).toBe('audio-left'); }); + + it('includes attached captions when expanding ids for deletion', () => { + const items: TimelineItem[] = [ + makeVideoItem({ id: 'video-1', linkedGroupId: undefined }), + { + id: 'caption-1', + type: 'text', + trackId: 'caption-track', + from: 0, + durationInFrames: 60, + label: 'Caption', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }, + ]; + + expect(expandIdsWithLinkedItems(items, ['video-1'], false)).toEqual(['video-1', 'caption-1']); + }); }); diff --git a/src/features/timeline/stores/actions/linked-edit.ts b/src/features/timeline/stores/actions/linked-edit.ts index 31a8f6fd0..356975ecf 100644 --- a/src/features/timeline/stores/actions/linked-edit.ts +++ b/src/features/timeline/stores/actions/linked-edit.ts @@ -1,6 +1,7 @@ import type { TimelineItem } from '@/types/timeline'; import { buildSynchronizedLinkedMoveUpdates, + expandItemIdsWithAttachedCaptions, expandSelectionWithLinkedItems, getLinkedItems, getMatchingSynchronizedLinkedCounterpart, @@ -19,10 +20,10 @@ export function expandIdsWithLinkedItems( linkedSelectionEnabled: boolean, ): string[] { if (!linkedSelectionEnabled) { - return Array.from(new Set(ids)); + return expandItemIdsWithAttachedCaptions(items, Array.from(new Set(ids))); } - return expandSelectionWithLinkedItems(items, ids); + 
return expandItemIdsWithAttachedCaptions(items, expandSelectionWithLinkedItems(items, ids)); } export function getLinkedItemsForEdit( @@ -78,8 +79,16 @@ export function buildLinkedLeftShiftUpdates( linkedSelectionEnabled: boolean, ): Array<{ id: string; from: number }> { if (!linkedSelectionEnabled) { + const shiftByItemId = new Map(baseShiftByItemId); + for (const [itemId, shiftAmount] of baseShiftByItemId) { + if (shiftAmount <= 0) continue; + for (const attachedId of expandItemIdsWithAttachedCaptions(items, [itemId])) { + shiftByItemId.set(attachedId, Math.max(shiftByItemId.get(attachedId) ?? 0, shiftAmount)); + } + } + return items.flatMap((item) => { - const shiftAmount = baseShiftByItemId.get(item.id) ?? 0; + const shiftAmount = shiftByItemId.get(item.id) ?? 0; return shiftAmount > 0 ? [{ id: item.id, from: item.from - shiftAmount }] : []; @@ -111,6 +120,13 @@ export function buildLinkedLeftShiftUpdates( } } + for (const [itemId, shiftAmount] of shiftByItemId) { + if (shiftAmount <= 0) continue; + for (const attachedId of expandItemIdsWithAttachedCaptions(items, [itemId])) { + shiftByItemId.set(attachedId, Math.max(shiftByItemId.get(attachedId) ?? 0, shiftAmount)); + } + } + return items.flatMap((item) => { const shiftAmount = shiftByItemId.get(item.id) ?? 0; return shiftAmount > 0 diff --git a/src/features/timeline/stores/timeline-persistence.ts b/src/features/timeline/stores/timeline-persistence.ts index 4bf32a806..c209b7bdc 100644 --- a/src/features/timeline/stores/timeline-persistence.ts +++ b/src/features/timeline/stores/timeline-persistence.ts @@ -768,7 +768,7 @@ export async function saveTimeline(projectId: string): Promise { }); } - // Save thumbnail to IndexedDB + // Save thumbnail to workspace storage thumbnailId = `project:${projectId}:cover`; await saveThumbnail({ id: thumbnailId, diff --git a/src/features/timeline/utils/linked-items.test.ts b/src/features/timeline/utils/linked-items.test.ts index 96a72f68b..90204bcce 100644 --- a/src/features/timeline/utils/linked-items.test.ts +++ b/src/features/timeline/utils/linked-items.test.ts @@ -1,11 +1,14 @@ import { describe, expect, it } from 'vitest'; import type { TimelineItem } from '@/types/timeline'; import { + expandItemIdsWithAttachedCaptions, buildLinkedMovePreviewUpdates, canLinkSelection, canLinkItems, expandSelectionWithLinkedItems, filterUnlockedItemIds, + getAttachedCaptionItemIds, + getLinkedAndAttachedItemIds, getLinkedItemIds, getLinkedSyncOffsetFrames, getUniqueLinkedItemAnchorIds, @@ -70,6 +73,46 @@ describe('linked items', () => { expect(expandSelectionWithLinkedItems(items, ['video-1', 'video-2'])).toEqual(['video-1', 'audio-1', 'video-2']); }); + it('finds caption-role text attached to a clip', () => { + const items = [ + makeItem({ id: 'video-1', type: 'video' }), + makeItem({ + id: 'caption-1', + type: 'text', + text: 'Hello', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }), + makeItem({ + id: 'manual-text', + type: 'text', + text: 'Manual', + color: '#fff', + }), + ]; + + expect(getAttachedCaptionItemIds(items, 'video-1')).toEqual(['caption-1']); + expect(expandItemIdsWithAttachedCaptions(items, ['video-1'])).toEqual(['video-1', 'caption-1']); + }); + + it('includes attached captions when expanding a linked clip pair', () => { + const items = [ + makeItem({ id: 'video-1', linkedGroupId: 'group-1', type: 'video' }), + makeItem({ id: 'audio-1', linkedGroupId: 'group-1', type: 'audio' }), + makeItem({ + id: 'caption-1', + type: 
'text', + text: 'Caption', + color: '#fff', + textRole: 'caption', + captionSource: { type: 'transcript', clipId: 'video-1', mediaId: 'media-1' }, + }), + ]; + + expect(getLinkedAndAttachedItemIds(items, 'audio-1')).toEqual(['video-1', 'audio-1', 'caption-1']); + }); + it('dedupes linked groups down to one split anchor', () => { const items = [ makeItem({ id: 'comp-video-1', linkedGroupId: 'group-1', type: 'composition', compositionId: 'comp-1' }), diff --git a/src/features/timeline/utils/linked-items.ts b/src/features/timeline/utils/linked-items.ts index be42022ae..3d34d7567 100644 --- a/src/features/timeline/utils/linked-items.ts +++ b/src/features/timeline/utils/linked-items.ts @@ -30,6 +30,43 @@ export function getLinkedItemIds(items: TimelineItem[], itemId: string): string[ return getLinkedItems(items, itemId).map((item) => item.id); } +export function getAttachedCaptionItemIds(items: TimelineItem[], itemId: string): string[] { + const anchor = items.find((item) => item.id === itemId); + if (!anchor || anchor.type === 'text') { + return []; + } + + return items + .filter((item) => + item.type === 'text' + && (item.textRole === 'caption' || item.captionSource !== undefined) + && item.captionSource?.clipId === anchor.id + ) + .map((item) => item.id); +} + +export function expandItemIdsWithAttachedCaptions(items: TimelineItem[], itemIds: string[]): string[] { + const expandedIds = new Set(); + const captionIds = new Set(); + + for (const itemId of itemIds) { + expandedIds.add(itemId); + for (const captionId of getAttachedCaptionItemIds(items, itemId)) { + captionIds.add(captionId); + } + } + + for (const captionId of captionIds) { + expandedIds.add(captionId); + } + + return Array.from(expandedIds); +} + +export function getLinkedAndAttachedItemIds(items: TimelineItem[], itemId: string): string[] { + return expandItemIdsWithAttachedCaptions(items, getLinkedItemIds(items, itemId)); +} + export function filterUnlockedItemIds( items: TimelineItem[], tracks: Pick[], diff --git a/src/features/timeline/utils/zoom-anchor.test.ts b/src/features/timeline/utils/zoom-anchor.test.ts new file mode 100644 index 000000000..627a41baa --- /dev/null +++ b/src/features/timeline/utils/zoom-anchor.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it } from 'vitest'; + +import { + getAnchoredZoomScrollLeft, + getCursorZoomAnchor, + getPlayheadZoomAnchor, +} from './zoom-anchor'; + +describe('zoom-anchor', () => { + it('derives a cursor anchor from the visible cursor position', () => { + expect(getCursorZoomAnchor({ + currentZoomLevel: 1, + cursorScreenX: 180, + maxDurationSeconds: 10, + scrollLeft: 40, + })).toEqual({ + anchorScreenX: 180, + anchorTimeSeconds: 2.2, + }); + }); + + it('derives a playhead anchor from the current playhead frame', () => { + expect(getPlayheadZoomAnchor({ + currentFrame: 60, + currentZoomLevel: 1, + fps: 30, + maxDurationSeconds: 10, + scrollLeft: 50, + })).toEqual({ + anchorScreenX: 150, + anchorTimeSeconds: 2, + }); + }); + + it('computes scrollLeft so the playhead stays in place while zooming', () => { + const playheadAnchor = getPlayheadZoomAnchor({ + currentFrame: 60, + currentZoomLevel: 1, + fps: 30, + maxDurationSeconds: 10, + scrollLeft: 50, + }); + + expect(getAnchoredZoomScrollLeft({ + anchor: playheadAnchor, + maxDurationSeconds: 10, + nextZoomLevel: 2, + })).toBe(250); + }); +}); diff --git a/src/features/timeline/utils/zoom-anchor.ts b/src/features/timeline/utils/zoom-anchor.ts new file mode 100644 index 000000000..38563737d --- /dev/null +++ 
b/src/features/timeline/utils/zoom-anchor.ts @@ -0,0 +1,69 @@ +import { ZOOM_MAX, ZOOM_MIN } from '../constants'; + +const PIXELS_PER_SECOND_AT_100_PERCENT = 100; + +function zoomLevelToPixelsPerSecond(zoomLevel: number): number { + return zoomLevel * PIXELS_PER_SECOND_AT_100_PERCENT; +} + +function clampTimeSeconds(timeSeconds: number, maxDurationSeconds: number): number { + return Math.max(0, Math.min(timeSeconds, maxDurationSeconds)); +} + +export interface TimelineZoomAnchor { + anchorScreenX: number; + anchorTimeSeconds: number; +} + +export function getCursorZoomAnchor(params: { + currentZoomLevel: number; + cursorScreenX: number; + maxDurationSeconds: number; + scrollLeft: number; +}): TimelineZoomAnchor { + const currentPixelsPerSecond = zoomLevelToPixelsPerSecond(params.currentZoomLevel); + const anchorContentX = params.scrollLeft + params.cursorScreenX; + + return { + anchorScreenX: params.cursorScreenX, + anchorTimeSeconds: clampTimeSeconds( + anchorContentX / currentPixelsPerSecond, + params.maxDurationSeconds, + ), + }; +} + +export function getPlayheadZoomAnchor(params: { + currentFrame: number; + currentZoomLevel: number; + fps: number; + maxDurationSeconds: number; + scrollLeft: number; +}): TimelineZoomAnchor { + const safeFps = params.fps > 0 ? params.fps : 1; + const anchorTimeSeconds = clampTimeSeconds( + params.currentFrame / safeFps, + params.maxDurationSeconds, + ); + const anchorContentX = anchorTimeSeconds * zoomLevelToPixelsPerSecond(params.currentZoomLevel); + + return { + anchorScreenX: anchorContentX - params.scrollLeft, + anchorTimeSeconds, + }; +} + +export function getAnchoredZoomScrollLeft(params: { + anchor: TimelineZoomAnchor; + maxDurationSeconds: number; + nextZoomLevel: number; +}): number { + const clampedZoomLevel = Math.max(ZOOM_MIN, Math.min(ZOOM_MAX, params.nextZoomLevel)); + const anchorTimeSeconds = clampTimeSeconds( + params.anchor.anchorTimeSeconds, + params.maxDurationSeconds, + ); + const nextAnchorContentX = anchorTimeSeconds * zoomLevelToPixelsPerSecond(clampedZoomLevel); + + return Math.max(0, nextAnchorContentX - params.anchor.anchorScreenX); +} diff --git a/src/features/timeline/workers/filmstrip-extraction-worker.ts b/src/features/timeline/workers/filmstrip-extraction-worker.ts index 293851396..38f8e16ec 100644 --- a/src/features/timeline/workers/filmstrip-extraction-worker.ts +++ b/src/features/timeline/workers/filmstrip-extraction-worker.ts @@ -1,23 +1,15 @@ /** * Filmstrip Extraction Worker * - * Extracts video frames using mediabunny's CanvasSink and saves - * directly to OPFS. All heavy work happens in the worker. - * - * Storage structure: - * filmstrips/{mediaId}/ - * meta.json - { width, height, isComplete, frameCount } - * 0.jpg, 1.jpg, 2.jpg, ... (legacy caches may still include .webp) + * Extracts video frames using mediabunny's CanvasSink. + * All heavy decode and JPEG encode work happens in the worker; the + * main thread persists the resulting blobs into the workspace. 
*/

import { createMediabunnyInputSource } from '@/infrastructure/browser/mediabunny-input-source';
import type { ObjectUrlSourceMetadata } from '@/infrastructure/browser/object-url-registry';
-import { safeWrite } from '../utils/opfs-safe-write';
-
-const FILMSTRIP_DIR = 'filmstrips';
const IMAGE_FORMAT = 'image/jpeg';
const IMAGE_QUALITY = 0.7; // JPEG is substantially faster to encode for tiny thumbnails
-const FRAME_FILE_EXT = 'jpg';
const FRAME_RATE = 1; // 1fps for filmstrip thumbnails

// Message types
@@ -39,7 +31,7 @@ export interface ExtractRequest {
endIndex?: number; // End frame index (exclusive)
totalFrames?: number; // Total frames across all workers (for progress)
workerId?: number; // Worker identifier for debugging
- maxParallelSaves?: number; // Optional memory-pressure throttle from main thread
+ maxParallelSaves?: number; // Reserved for future worker-local throttling
}

export interface AbortRequest {
@@ -93,37 +85,15 @@ function getRequestIdFromMessage(data: unknown): string {
const loadMediabunny = () => import('mediabunny');

/**
- * Get or create OPFS directory for filmstrip storage
- */
-async function getFilmstripDir(mediaId: string): Promise<FileSystemDirectoryHandle> {
- const root = await navigator.storage.getDirectory();
- const filmstripRoot = await root.getDirectoryHandle(FILMSTRIP_DIR, { create: true });
- return filmstripRoot.getDirectoryHandle(mediaId, { create: true });
-}
-
-/**
- * Save a frame to OPFS
- */
-async function saveFrame(
- dir: FileSystemDirectoryHandle,
- index: number,
- blob: Blob
-): Promise<void> {
- const fileHandle = await dir.getFileHandle(`${index}.${FRAME_FILE_EXT}`, { create: true });
- const writable = await fileHandle.createWritable();
- await safeWrite(writable, blob);
-}
-
-/**
- * Extract frames and save directly to OPFS
+ * Extract frames and return encoded JPEG blobs to the main thread.
 */
async function extractAndSave(
request: ExtractRequest,
state: { aborted: boolean }
): Promise<void> {
const {
- requestId, mediaId, blobUrl, blob, sourceMetadata, duration, width, height, skipIndices, priorityIndices, targetIndices,
- startIndex, endIndex, totalFrames: totalFramesOverride, maxParallelSaves
+ requestId, blobUrl, blob, sourceMetadata, duration, width, height, skipIndices, priorityIndices, targetIndices,
+ startIndex, endIndex, totalFrames: totalFramesOverride
} = request;

// Calculate frame range - support both full extraction and chunked
@@ -183,9 +153,6 @@ async function extractAndSave(
return;
}

- // Get OPFS directory
- const dir = await getFilmstripDir(mediaId);
-
// Load mediabunny
const { Input, CanvasSink, ALL_FORMATS } = await loadMediabunny();
@@ -231,13 +198,11 @@
}

// Two parallel pipelines per frame:
- // 1. FAST: createImageBitmap → transfer to main thread (instant display, no encode)
- // 2. SLOW: convertToBlob (JPEG) → save to OPFS (persistence, runs in background)
+ // 1. FAST: createImageBitmap -> transfer to main thread (instant display, no encode)
+ // 2. SLOW: convertToBlob (JPEG) -> send blob to main thread for persistence
//
// Bitmaps are sent immediately on every decoded frame for instant UI updates.
- // JPEG encode + OPFS save runs concurrently, blobs reported when ready.
- const pendingSaves: Promise<void>[] = [];
- const MAX_PARALLEL_SAVES = Math.max(1, Math.min(6, maxParallelSaves ?? 4));
+ // JPEG encode runs concurrently, with blobs reported as soon as they are ready.
let pendingEncode: Promise<{ blob: Blob; frameIndex: number }> | null = null; let bitmapsSinceLastReport: Array<{ index: number; bitmap: ImageBitmap }> = []; @@ -245,15 +210,7 @@ async function extractAndSave( if (!pendingEncode) return; const { blob, frameIndex } = await pendingEncode; pendingEncode = null; - const savePromise = saveFrame(dir, frameIndex, blob).then(() => { - const idx = pendingSaves.indexOf(savePromise); - if (idx > -1) pendingSaves.splice(idx, 1); - savedSinceLastReport.push({ index: frameIndex, blob }); - }); - pendingSaves.push(savePromise); - if (pendingSaves.length >= MAX_PARALLEL_SAVES) { - await Promise.race(pendingSaves); - } + savedSinceLastReport.push({ index: frameIndex, blob }); }; for await (const wrapped of sink.canvasesAtTimestamps(timestampGenerator())) { @@ -281,7 +238,7 @@ async function extractAndSave( // Queue bitmap for immediate transfer to main thread (no JPEG encode needed) bitmapsSinceLastReport.push({ index: frameIndex, bitmap: displayBitmap }); - // Flush prior encode, then start JPEG encode in background for OPFS persistence + // Flush prior encode, then start JPEG encode in background for workspace persistence await flushPendingEncode(); const encodeCanvas = new OffscreenCanvas(encodeBitmap.width, encodeBitmap.height); const encodeCtx = encodeCanvas.getContext('2d')!; @@ -296,7 +253,7 @@ async function extractAndSave( frameListIndex++; // Send progress with bitmaps on every frame for instant display. - // savedFrames/savedIndices lag behind as JPEG encode + OPFS write complete. + // savedFrames/savedIndices lag behind as JPEG encode completes. const shouldReport = extractedCount <= 3 || extractedCount % 10 === 0 || bitmapsSinceLastReport.length > 0; if (shouldReport) { @@ -325,11 +282,6 @@ async function extractAndSave( // Flush the last pipelined encode await flushPendingEncode(); - // Wait for all pending saves to complete - if (pendingSaves.length > 0) { - await Promise.all(pendingSaves); - } - // Emit any saved frames that completed after the final progress report. 
if (savedSinceLastReport.length > 0) { const progress = Math.round((extractedCount / totalFrames) * 100); diff --git a/src/infrastructure/analysis/index.ts b/src/infrastructure/analysis/index.ts index d4267f818..02dd6138e 100644 --- a/src/infrastructure/analysis/index.ts +++ b/src/infrastructure/analysis/index.ts @@ -4,10 +4,38 @@ */ export { detectScenes, clearSceneCache } from '@/lib/analysis'; -export type { SceneCut, SceneDetectionProgress, VerificationModel } from '@/lib/analysis'; +export type { + SceneCut, + SceneDetectionProgress, + VerificationModel, +} from '@/lib/analysis'; export { getSceneVerificationModelLabel, getSceneVerificationModelOptions, } from '@/lib/analysis'; export { captionVideo, captionImage } from '@/lib/analysis'; export type { MediaCaption, CaptioningProgress, CaptioningOptions } from '@/lib/analysis'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + sliceTranscript, + extractDominantColors, + extractDominantColorPhrase, + rgbToLab, + deltaE76, + deltaE2000, +} from '@/lib/analysis'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from '@/lib/analysis'; diff --git a/src/infrastructure/storage/cache-version.ts b/src/infrastructure/storage/cache-version.ts index 32b825fb1..f0dea0018 100644 --- a/src/infrastructure/storage/cache-version.ts +++ b/src/infrastructure/storage/cache-version.ts @@ -25,7 +25,7 @@ const VERSION_PREFIX = 'cache-version-'; const CACHE_VERSIONS = { filmstrip: 9, // OPFS filmstrip frames (v9: invalidate incorrect partial-complete prewarms) waveform: 3, // OPFS waveform data (v3: stereo interleaved L/R peaks) - thumbnail: 1, // IndexedDB thumbnails + thumbnail: 1, // Workspace-backed thumbnails media: 1, // OPFS media files } as const; diff --git a/src/infrastructure/storage/index.ts b/src/infrastructure/storage/index.ts index 46ae9bedc..ba07549c4 100644 --- a/src/infrastructure/storage/index.ts +++ b/src/infrastructure/storage/index.ts @@ -86,6 +86,44 @@ export { deleteTranscript, } from '@/infrastructure/storage/workspace-fs/transcripts'; +// AI captions (vision-language-model frame descriptions) +export { + getCaptions, + saveCaptions, + deleteCaptions, + saveCaptionThumbnail, + getCaptionThumbnailBlob, + probeCaptionThumbnail, + deleteCaptionThumbnails, + saveCaptionEmbeddings, + getCaptionEmbeddings, + getCaptionsEmbeddingsMeta, + deleteCaptionEmbeddings, + saveCaptionImageEmbeddings, + getCaptionImageEmbeddings, +} from '@/infrastructure/storage/workspace-fs/captions'; + +// Scene-detection results +export { + getScenes, + saveScenes, + deleteScenes, + type SavedScenes, +} from '@/infrastructure/storage/workspace-fs/scenes'; + +// Generic AI-output envelope (use these directly for new AI services) +export { + readAiOutput, + writeAiOutput, + deleteAiOutput, + listAiOutputs, + getMediaIdsWithAiOutput, + AI_OUTPUT_SCHEMA_VERSION, + type AiOutput, + type AiOutputKind, + type AiOutputPayloads, +} from '@/infrastructure/storage/workspace-fs/ai-outputs'; + // Orphan cache sweep export { sweepWorkspaceOrphans, diff --git a/src/infrastructure/storage/workspace-fs/README.template.md b/src/infrastructure/storage/workspace-fs/README.template.md index 816c33044..ed229aff6 100644 --- a/src/infrastructure/storage/workspace-fs/README.template.md +++ b/src/infrastructure/storage/workspace-fs/README.template.md @@ -1,6 +1,6 @@ # FreeCut 
Workspace -This folder is your FreeCut project workspace — the app's source of truth +This folder is your FreeCut project workspace - the app's source of truth for everything: projects, media metadata, thumbnails, waveforms, caches. Everything here is **plain files** you can `cat`, `grep`, and diff with @@ -10,30 +10,33 @@ normal tools. AI coding agents can read them directly without a browser. ``` ./ -├── README.md ← this file -├── .freecut-workspace.json ← marker + schema version -├── index.json ← fast project list -├── projects/ -│ └── / -│ ├── project.json ← timeline, settings, keyframes, markers, transitions -│ ├── thumbnail.jpg -│ └── media-links.json ← which media this project uses -├── media/ -│ └── / -│ ├── metadata.json ← codec, duration, resolution, etc. -│ ├── source. ← inline source file -│ ├── source.link.json ← OR a link descriptor to an external file -│ ├── thumbnail.jpg -│ └── cache/ -│ ├── filmstrip/ ← timeline thumbnails -│ ├── waveform/ ← audio peaks (binned binary) -│ ├── gif-frames/ ← pre-extracted GIF frames -│ ├── decoded-audio/ ← preview audio for non-browser codecs -│ └── transcript.json -└── content/ - └── / - ├── refs.json ← reference count - └── data. ← deduped blob +|-- README.md <- this file +|-- .freecut-workspace.json <- marker + schema version +|-- index.json <- fast project list +|-- projects/ +| `-- / +| |-- project.json <- timeline, settings, keyframes, markers, transitions +| |-- thumbnail.jpg +| `-- media-links.json <- which media this project uses +|-- media/ +| `-- / +| |-- metadata.json <- codec, duration, resolution, etc. +| |-- source. <- inline source file +| |-- source.link.json <- OR a link descriptor to an external file +| |-- thumbnail.jpg +| `-- cache/ +| |-- waveform/ <- audio peaks (binned binary) +| |-- gif-frames/ <- pre-extracted GIF frames +| |-- decoded-audio/ <- preview audio for non-browser codecs +| `-- transcript.json +|-- filmstrips/ +| `-- / <- timeline thumbnail cache +| |-- meta.json +| `-- 0.jpg, 1.jpg, ... +`-- content/ + `-- / + |-- refs.json <- reference count + `-- data. <- deduped blob ``` ## Safe to edit? @@ -41,10 +44,10 @@ normal tools. AI coding agents can read them directly without a browser. Everything except media source bytes is safe to inspect. Editing `project.json` externally works; FreeCut picks up changes on next load. -Binary caches (waveforms, decoded audio) are regeneratable — delete and -the app will rebuild them on demand. +Binary caches (waveforms, decoded audio, filmstrips) are regeneratable - +delete them and the app will rebuild them on demand. ## Moving the workspace -You can move this folder to a new location — the app just needs you to +You can move this folder to a new location - the app just needs you to re-pick it via the "Reconnect" prompt on next launch. 
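The storage work that follows replaces per-kind ad-hoc persistence with a single envelope API (`readAiOutput`/`writeAiOutput`). As an orientation aid (not part of the patch), here is a minimal sketch of the intended round-trip; the `whisper-small` and `q8` literals are illustrative placeholders, and a workspace root is assumed to be connected already:

```ts
// Sketch only (not in the patch): round-tripping a transcript envelope.
// Assumes the workspace gate has already called setWorkspaceRoot; the
// 'whisper-small' / 'q8' literals are placeholder identifiers.
import { readAiOutput, writeAiOutput } from '@/infrastructure/storage/workspace-fs/ai-outputs';

async function persistTranscript(mediaId: string): Promise<void> {
  // First write stamps createdAt; later writes preserve it and bump updatedAt.
  await writeAiOutput({
    mediaId,
    kind: 'transcript',
    service: 'whisper',
    model: 'whisper-small',
    params: { quantization: 'q8' },
    data: {
      quantization: 'q8',
      modelVariant: 'whisper-small',
      text: 'hello world',
      segments: [{ text: 'hello world', start: 0, end: 1.2 }],
    },
  });

  // Passing kind 'transcript' narrows the envelope's data to TranscriptPayload.
  const envelope = await readAiOutput(mediaId, 'transcript');
  console.log(envelope?.model, envelope?.data.segments.length);
}
```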
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts new file mode 100644 index 000000000..5de646e47 --- /dev/null +++ b/src/infrastructure/storage/workspace-fs/ai-outputs/ai-outputs.test.ts @@ -0,0 +1,146 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +vi.mock('@/shared/logging/logger', () => ({ + createLogger: () => ({ + info: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + debug: vi.fn(), + event: vi.fn(), + startEvent: () => ({ set: vi.fn(), merge: vi.fn(), success: vi.fn(), failure: vi.fn() }), + child: vi.fn(), + setLevel: vi.fn(), + }), + createOperationId: () => 'op-test', +})); + +import { + deleteAiOutput, + getMediaIdsWithAiOutput, + listAiOutputs, + readAiOutput, + writeAiOutput, +} from './io'; +import { setWorkspaceRoot } from '../root'; +import { asHandle, createRoot } from '../__tests__/in-memory-handle'; + +afterEach(() => setWorkspaceRoot(null)); + +describe('workspace-fs ai-outputs', () => { + it('round-trips a captions envelope', async () => { + setWorkspaceRoot(asHandle(createRoot())); + const written = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + params: { sampleIntervalSec: 2 }, + data: { sampleIntervalSec: 2, captions: [{ timeSec: 0, text: 'hello' }] }, + }); + expect(written.createdAt).toBeGreaterThan(0); + expect(written.updatedAt).toBeGreaterThanOrEqual(written.createdAt); + + const loaded = await readAiOutput('m1', 'captions'); + expect(loaded?.data.captions[0]?.text).toBe('hello'); + expect(loaded?.service).toBe('lfm-captioning'); + }); + + it('preserves createdAt across updates', async () => { + setWorkspaceRoot(asHandle(createRoot())); + const first = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [{ timeSec: 0, text: 'v1' }] }, + }); + + await new Promise((r) => setTimeout(r, 2)); + const second = await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [{ timeSec: 0, text: 'v2' }] }, + }); + + expect(second.createdAt).toBe(first.createdAt); + expect(second.updatedAt).toBeGreaterThanOrEqual(first.updatedAt); + }); + + it('readAiOutput returns undefined when missing', async () => { + setWorkspaceRoot(asHandle(createRoot())); + expect(await readAiOutput('missing', 'captions')).toBeUndefined(); + }); + + it('deleteAiOutput removes the file', async () => { + setWorkspaceRoot(asHandle(createRoot())); + await writeAiOutput({ + mediaId: 'm1', + kind: 'scenes', + service: 'scene-detect', + model: 'histogram', + data: { + method: 'histogram', + sampleIntervalMs: 250, + fps: 30, + cuts: [], + }, + }); + await deleteAiOutput('m1', 'scenes'); + expect(await readAiOutput('m1', 'scenes')).toBeUndefined(); + }); + + it('listAiOutputs returns saved kinds', async () => { + setWorkspaceRoot(asHandle(createRoot())); + await writeAiOutput({ + mediaId: 'm1', + kind: 'captions', + service: 'lfm-captioning', + model: 'lfm-2.5-vl', + data: { captions: [] }, + }); + await writeAiOutput({ + mediaId: 'm1', + kind: 'scenes', + service: 'scene-detect', + model: 'histogram', + data: { + method: 'histogram', + sampleIntervalMs: 250, + fps: 30, + cuts: [], + }, + }); + + const kinds = await listAiOutputs('m1'); + expect(new Set(kinds)).toEqual(new Set(['captions', 'scenes'])); + }); + + it('getMediaIdsWithAiOutput filters to ids with output 
present', async () => {
+ setWorkspaceRoot(asHandle(createRoot()));
+ await writeAiOutput({
+ mediaId: 'm1',
+ kind: 'captions',
+ service: 'lfm-captioning',
+ model: 'lfm-2.5-vl',
+ data: { captions: [] },
+ });
+ await writeAiOutput({
+ mediaId: 'm3',
+ kind: 'captions',
+ service: 'lfm-captioning',
+ model: 'lfm-2.5-vl',
+ data: { captions: [] },
+ });
+
+ const ids = await getMediaIdsWithAiOutput(['m1', 'm2', 'm3'], 'captions');
+ expect(ids).toEqual(new Set(['m1', 'm3']));
+ });
+
+ it('getMediaIdsWithAiOutput short-circuits on empty input', async () => {
+ setWorkspaceRoot(null);
+ const ids = await getMediaIdsWithAiOutput([], 'captions');
+ expect(ids.size).toBe(0);
+ });
+});
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts
new file mode 100644
index 000000000..06b683d46
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/index.ts
@@ -0,0 +1,21 @@
+export type {
+ AiOutput,
+ AiOutputKind,
+ AiOutputPayloads,
+ TranscriptPayload,
+ CaptionsPayload,
+ ScenesPayload,
+ SceneCutPayload,
+} from './types';
+export {
+ AI_OUTPUT_SCHEMA_VERSION,
+ transcriptFromLegacy,
+ transcriptToLegacy,
+} from './types';
+export {
+ readAiOutput,
+ writeAiOutput,
+ deleteAiOutput,
+ listAiOutputs,
+ getMediaIdsWithAiOutput,
+} from './io';
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts
new file mode 100644
index 000000000..3e3e48633
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/io.ts
@@ -0,0 +1,134 @@
+/**
+ * Generic CRUD for AI output envelopes under `media/{id}/cache/ai/{kind}.json`.
+ *
+ * Every per-kind wrapper (transcripts, captions, scenes…) delegates here so
+ * the on-disk layout, envelope shape, and error handling stay uniform.
+ */
+
+import { createLogger } from '@/shared/logging/logger';
+
+import { requireWorkspaceRoot } from '../root';
+import {
+ readJson,
+ removeEntry,
+ writeJsonAtomic,
+ listDirectory,
+} from '../fs-primitives';
+import { aiOutputPath, aiOutputsDir } from '../paths';
+
+import {
+ AI_OUTPUT_SCHEMA_VERSION,
+ type AiOutput,
+ type AiOutputKind,
+ type AiOutputPayloads,
+} from './types';
+
+const logger = createLogger('WorkspaceFS:AiOutputs');
+
+export async function readAiOutput<K extends AiOutputKind>(
+ mediaId: string,
+ kind: K,
+): Promise<AiOutput<K> | undefined> {
+ const root = requireWorkspaceRoot();
+ try {
+ const result = await readJson<AiOutput<K>>(root, aiOutputPath(mediaId, kind));
+ return result ?? undefined;
+ } catch (error) {
+ logger.error(`readAiOutput(${mediaId}, ${kind}) failed`, error);
+ throw new Error(`Failed to load AI output ${kind} for ${mediaId}`);
+ }
+}
+
+interface WriteInput<K extends AiOutputKind> {
+ mediaId: string;
+ kind: K;
+ service: string;
+ model: string;
+ params?: Record<string, unknown>;
+ data: AiOutputPayloads[K];
+}
+
+/**
+ * Write an envelope atomically. Sets `createdAt` on first write and updates
+ * `updatedAt` every time. Returns the persisted envelope.
+ */
+export async function writeAiOutput<K extends AiOutputKind>(
+ input: WriteInput<K>,
+): Promise<AiOutput<K>> {
+ const root = requireWorkspaceRoot();
+ const now = Date.now();
+ const existing = await readJson<AiOutput<K>>(root, aiOutputPath(input.mediaId, input.kind));
+
+ const envelope: AiOutput<K> = {
+ schemaVersion: AI_OUTPUT_SCHEMA_VERSION,
+ kind: input.kind,
+ mediaId: input.mediaId,
+ service: input.service,
+ model: input.model,
+ params: input.params ?? {},
+ createdAt: existing?.createdAt ?? now,
+ updatedAt: now,
+ data: input.data,
+ };
+
+ try {
+ await writeJsonAtomic(root, aiOutputPath(input.mediaId, input.kind), envelope);
+ return envelope;
+ } catch (error) {
+ logger.error(`writeAiOutput(${input.mediaId}, ${input.kind}) failed`, error);
+ throw new Error(`Failed to save AI output ${input.kind} for ${input.mediaId}`);
+ }
+}
+
+export async function deleteAiOutput(
+ mediaId: string,
+ kind: AiOutputKind,
+): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, aiOutputPath(mediaId, kind));
+ } catch (error) {
+ logger.error(`deleteAiOutput(${mediaId}, ${kind}) failed`, error);
+ throw new Error(`Failed to delete AI output ${kind} for ${mediaId}`);
+ }
+}
+
+/**
+ * List every AI output kind present for `mediaId`. Returns the `kind` stems
+ * (no extension). Used by cleanup sweeps and debug UIs.
+ */
+export async function listAiOutputs(mediaId: string): Promise<AiOutputKind[]> {
+ const root = requireWorkspaceRoot();
+ try {
+ const entries = await listDirectory(root, aiOutputsDir(mediaId));
+ return entries
+ .filter((entry) => entry.kind === 'file' && entry.name.endsWith('.json'))
+ .map((entry) => entry.name.slice(0, -'.json'.length) as AiOutputKind);
+ } catch (error) {
+ logger.warn(`listAiOutputs(${mediaId}) failed`, error);
+ return [];
+ }
+}
+
+/**
+ * Bulk existence probe. Returns the subset of `mediaIds` that have a saved
+ * output of `kind`. Concurrent reads — callers should pre-batch by kind.
+ */
+export async function getMediaIdsWithAiOutput(
+ mediaIds: string[],
+ kind: AiOutputKind,
+): Promise<Set<string>> {
+ if (mediaIds.length === 0) return new Set();
+ const root = requireWorkspaceRoot();
+ const ready = new Set<string>();
+ const results = await Promise.all(
+ mediaIds.map(async (id) => {
+ const env = await readJson<AiOutput>(root, aiOutputPath(id, kind));
+ return env ? id : null;
+ }),
+ );
+ for (const id of results) {
+ if (id) ready.add(id);
+ }
+ return ready;
+}
diff --git a/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts b/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts
new file mode 100644
index 000000000..00411af3e
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/ai-outputs/types.ts
@@ -0,0 +1,143 @@
+/**
+ * Shared envelope for every AI-derived analysis output stored under
+ * `media/{id}/cache/ai/{kind}.json`.
+ *
+ * One file per `AiOutputKind`. Envelope fields are identical across kinds so
+ * invalidation logic ("is this transcript from the same model we use today?")
+ * works uniformly. Service-specific data goes inside `data`.
+ */
+
+import type { MediaCaption } from '@/infrastructure/analysis';
+import type { MediaTranscript, MediaTranscriptModel, MediaTranscriptQuantization } from '@/types/storage';
+
+/**
+ * Registry of AI output kinds. Adding a new AI service means:
+ * 1. Add its name here.
+ * 2. Add its payload type to `AiOutputPayloads` below.
+ * 3. (Optional) Add a thin wrapper in `workspace-fs/` that calls
+ * `readAiOutput/writeAiOutput` with that kind.
+ */
+export type AiOutputKind =
+ | 'transcript'
+ | 'captions'
+ | 'scenes';
+
+/**
+ * Typed payload per kind. Matches the `data` field on `AiOutput`.
+ * New kinds must be registered here so the storage API stays strongly typed.
+ */
+export interface AiOutputPayloads {
+ transcript: TranscriptPayload;
+ captions: CaptionsPayload;
+ scenes: ScenesPayload;
+}
+
+/**
+ * Current schema version for the envelope itself. Bump when the envelope
+ * shape changes (not when a payload changes — that's the payload's concern).
+ */
+export const AI_OUTPUT_SCHEMA_VERSION = 1;
+
+export interface AiOutput<K extends AiOutputKind = AiOutputKind> {
+ schemaVersion: typeof AI_OUTPUT_SCHEMA_VERSION;
+ kind: K;
+ mediaId: string;
+ /** Stable service identifier, e.g. `"whisper-wasm"`, `"lfm-captioning"`. */
+ service: string;
+ /** Model id/version, e.g. `"whisper-small"`, `"lfm-2.5-vl"`. */
+ model: string;
+ /** Service-specific inputs that affect the output (quantization, threshold, sample interval). */
+ params: Record<string, unknown>;
+ createdAt: number;
+ updatedAt: number;
+ data: AiOutputPayloads[K];
+}
+
+/* ───────────────── Payload shapes ───────────────── */
+
+export interface TranscriptPayload {
+ language?: string;
+ quantization: MediaTranscriptQuantization;
+ modelVariant: MediaTranscriptModel;
+ text: string;
+ segments: Array<{ text: string; start: number; end: number }>;
+}
+
+export type CaptionsPayload = {
+ sampleIntervalSec?: number;
+ /**
+ * Identifier of the text embedding model whose vectors live in the
+ * companion `captions-embeddings.bin` file. Absence means embeddings
+ * haven't been computed yet (keyword search still works).
+ */
+ embeddingModel?: string;
+ /** Dimension of each text embedding vector, e.g. 384 for all-MiniLM-L6-v2. */
+ embeddingDim?: number;
+ /**
+ * Identifier of the image (CLIP) embedding model whose vectors live
+ * in `captions-image-embeddings.bin`. Independent of the text model;
+ * present only when thumbnails have been visually indexed.
+ */
+ imageEmbeddingModel?: string;
+ /** Dimension of each image embedding vector, e.g. 512 for CLIP base. */
+ imageEmbeddingDim?: number;
+ captions: MediaCaption[];
+};
+
+export interface SceneCutPayload {
+ frame: number;
+ time: number;
+ /** Service-defined motion metadata (histogram distance, flow magnitude, etc.). */
+ motion: unknown;
+ verified?: boolean;
+}
+
+export interface ScenesPayload {
+ method: 'histogram' | 'optical-flow';
+ sampleIntervalMs: number;
+ verificationModel?: string;
+ fps: number;
+ cuts: SceneCutPayload[];
+}
+
+/* ───────────────── Conversions ───────────────── */
+
+/**
+ * Adapter: build a transcript envelope from the legacy {@link MediaTranscript}
+ * record shape. Keeps callers that already construct `MediaTranscript` working
+ * unchanged during the migration.
+ */
+export function transcriptFromLegacy(record: MediaTranscript): AiOutput<'transcript'> {
+ return {
+ schemaVersion: AI_OUTPUT_SCHEMA_VERSION,
+ kind: 'transcript',
+ mediaId: record.mediaId,
+ service: 'whisper',
+ model: record.model,
+ params: { quantization: record.quantization, language: record.language },
+ createdAt: record.createdAt,
+ updatedAt: record.updatedAt ?? record.createdAt,
+ data: {
+ language: record.language,
+ quantization: record.quantization,
+ modelVariant: record.model,
+ text: record.text,
+ segments: record.segments,
+ },
+ };
+}
+
+/** Inverse of {@link transcriptFromLegacy}.
*/
+export function transcriptToLegacy(envelope: AiOutput<'transcript'>): MediaTranscript {
+ return {
+ id: envelope.mediaId,
+ mediaId: envelope.mediaId,
+ model: envelope.data.modelVariant,
+ language: envelope.data.language,
+ quantization: envelope.data.quantization,
+ text: envelope.data.text,
+ segments: envelope.data.segments,
+ createdAt: envelope.createdAt,
+ updatedAt: envelope.updatedAt,
+ };
+}
diff --git a/src/infrastructure/storage/workspace-fs/captions.ts b/src/infrastructure/storage/workspace-fs/captions.ts
new file mode 100644
index 000000000..25821876a
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/captions.ts
@@ -0,0 +1,321 @@
+/**
+ * Per-media AI captions (vision-language-model frame descriptions).
+ *
+ * Stored at `media/{mediaId}/cache/ai/captions.json` as an {@link AiOutput}
+ * envelope. A denormalized copy lives on `MediaMetadata.aiCaptions` as a
+ * read-path convenience for UI consumers — writers must keep them in sync.
+ */
+
+import type { MediaCaption } from '@/infrastructure/analysis';
+import { createLogger } from '@/shared/logging/logger';
+
+import { readAiOutput, writeAiOutput, deleteAiOutput } from './ai-outputs';
+import { readArrayBuffer, readBlob, removeEntry, writeBlob } from './fs-primitives';
+import {
+ captionEmbeddingsPath,
+ captionImageEmbeddingsPath,
+ captionThumbPath,
+ captionThumbRelPath,
+ captionThumbsDir,
+} from './paths';
+import { requireWorkspaceRoot } from './root';
+
+const logger = createLogger('WorkspaceFS:Captions');
+
+interface SaveCaptionsInput {
+ mediaId: string;
+ captions: MediaCaption[];
+ /** Stable provider id, e.g. `"lfm-captioning"`. */
+ service: string;
+ /** Model id/version reported by the provider, e.g. `"lfm-2.5-vl"`. */
+ model: string;
+ /** Sample interval used at generation time — kept for invalidation. */
+ sampleIntervalSec?: number;
+ /** Text-embedding model id whose vectors are stored in the companion `.bin`. */
+ embeddingModel?: string;
+ /** Dimension of each text embedding vector. */
+ embeddingDim?: number;
+ /** CLIP image-embedding model id (separate bin). */
+ imageEmbeddingModel?: string;
+ /** Dimension of each image embedding vector. */
+ imageEmbeddingDim?: number;
+}
+
+export async function getCaptions(
+ mediaId: string,
+): Promise<MediaCaption[] | undefined> {
+ try {
+ const envelope = await readAiOutput(mediaId, 'captions');
+ return envelope?.data.captions;
+ } catch (error) {
+ logger.error(`getCaptions(${mediaId}) failed`, error);
+ throw new Error(`Failed to load captions: ${mediaId}`);
+ }
+}
+
+export async function saveCaptions(input: SaveCaptionsInput): Promise<MediaCaption[]> {
+ try {
+ const written = await writeAiOutput({
+ mediaId: input.mediaId,
+ kind: 'captions',
+ service: input.service,
+ model: input.model,
+ params: input.sampleIntervalSec !== undefined ? { sampleIntervalSec: input.sampleIntervalSec } : {},
+ data: {
+ sampleIntervalSec: input.sampleIntervalSec,
+ embeddingModel: input.embeddingModel,
+ embeddingDim: input.embeddingDim,
+ imageEmbeddingModel: input.imageEmbeddingModel,
+ imageEmbeddingDim: input.imageEmbeddingDim,
+ captions: input.captions,
+ },
+ });
+ return written.data.captions;
+ } catch (error) {
+ logger.error(`saveCaptions(${input.mediaId}) failed`, error);
+ throw new Error(`Failed to save captions: ${input.mediaId}`);
+ }
+}
+
+/**
+ * Read the raw embedding metadata saved alongside captions — both text
+ * and image model identifiers so ranking can decide whether each bin is
+ * safe to load back in.
+ */
+export async function getCaptionsEmbeddingsMeta(
+ mediaId: string,
+): Promise<{
+ embeddingModel?: string;
+ embeddingDim?: number;
+ imageEmbeddingModel?: string;
+ imageEmbeddingDim?: number;
+} | null> {
+ const envelope = await readAiOutput(mediaId, 'captions');
+ if (!envelope) return null;
+ return {
+ embeddingModel: envelope.data.embeddingModel,
+ embeddingDim: envelope.data.embeddingDim,
+ imageEmbeddingModel: envelope.data.imageEmbeddingModel,
+ imageEmbeddingDim: envelope.data.imageEmbeddingDim,
+ };
+}
+
+/**
+ * Persist caption embeddings as a contiguous `Float32Array`. Layout is
+ * `captionCount * embeddingDim` floats, stored in caption index order.
+ * The companion `captions.json` records {@link embeddingModel} and
+ * {@link embeddingDim} so a later read can detect model-drift before
+ * trusting the payload.
+ */
+export async function saveCaptionEmbeddings(
+ mediaId: string,
+ vectors: Float32Array[],
+ embeddingDim: number,
+): Promise<void> {
+ if (vectors.length === 0) return;
+ const root = requireWorkspaceRoot();
+ const packed = new Float32Array(vectors.length * embeddingDim);
+ vectors.forEach((vector, index) => {
+ if (vector.length !== embeddingDim) {
+ throw new Error(
+ `Embedding dim mismatch at index ${index}: got ${vector.length}, expected ${embeddingDim}`,
+ );
+ }
+ packed.set(vector, index * embeddingDim);
+ });
+ try {
+ await writeBlob(root, captionEmbeddingsPath(mediaId), packed.buffer);
+ } catch (error) {
+ logger.error(`saveCaptionEmbeddings(${mediaId}) failed`, error);
+ throw new Error(`Failed to save caption embeddings: ${mediaId}`);
+ }
+}
+
+/**
+ * Load caption embeddings back into an array of `Float32Array`s. Returns
+ * `null` when no `.bin` exists (pre-feature captions) or when the saved
+ * vector count doesn't match `expectedCount` (captions changed under our
+ * feet and the bin is stale).
+ */
+export async function getCaptionEmbeddings(
+ mediaId: string,
+ embeddingDim: number,
+ expectedCount: number,
+): Promise<Float32Array[] | null> {
+ if (expectedCount === 0) return [];
+ const root = requireWorkspaceRoot();
+ try {
+ const buffer = await readArrayBuffer(root, captionEmbeddingsPath(mediaId));
+ if (!buffer) return null;
+ const expectedFloats = expectedCount * embeddingDim;
+ const got = buffer.byteLength / Float32Array.BYTES_PER_ELEMENT;
+ if (got !== expectedFloats) {
+ logger.warn(
+ `getCaptionEmbeddings(${mediaId}): bin has ${got} floats, expected ${expectedFloats} — treating as stale`,
+ );
+ return null;
+ }
+ const packed = new Float32Array(buffer);
+ const vectors: Float32Array[] = [];
+ for (let i = 0; i < expectedCount; i += 1) {
+ vectors.push(packed.slice(i * embeddingDim, (i + 1) * embeddingDim));
+ }
+ return vectors;
+ } catch (error) {
+ logger.warn(`getCaptionEmbeddings(${mediaId}) failed`, error);
+ return null;
+ }
+}
+
+export async function deleteCaptionEmbeddings(mediaId: string): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, captionEmbeddingsPath(mediaId));
+ } catch (error) {
+ logger.warn(`deleteCaptionEmbeddings(${mediaId}) failed`, error);
+ }
+ try {
+ await removeEntry(root, captionImageEmbeddingsPath(mediaId));
+ } catch (error) {
+ logger.warn(`deleteCaptionImageEmbeddings(${mediaId}) failed`, error);
+ }
+}
+
+/**
+ * Persist per-caption CLIP image embeddings. Same layout as text
+ * embeddings — `captionCount * embeddingDim` packed floats in caption
+ * order. Safe to call independently of {@link saveCaptionEmbeddings};
+ * either bin can exist without the other.
+ */
+export async function saveCaptionImageEmbeddings(
+ mediaId: string,
+ vectors: Float32Array[],
+ embeddingDim: number,
+): Promise<void> {
+ if (vectors.length === 0) return;
+ const root = requireWorkspaceRoot();
+ const packed = new Float32Array(vectors.length * embeddingDim);
+ vectors.forEach((vector, index) => {
+ if (vector.length !== embeddingDim) {
+ throw new Error(
+ `Image embedding dim mismatch at index ${index}: got ${vector.length}, expected ${embeddingDim}`,
+ );
+ }
+ packed.set(vector, index * embeddingDim);
+ });
+ try {
+ await writeBlob(root, captionImageEmbeddingsPath(mediaId), packed.buffer);
+ } catch (error) {
+ logger.error(`saveCaptionImageEmbeddings(${mediaId}) failed`, error);
+ throw new Error(`Failed to save caption image embeddings: ${mediaId}`);
+ }
+}
+
+export async function getCaptionImageEmbeddings(
+ mediaId: string,
+ embeddingDim: number,
+ expectedCount: number,
+): Promise<Float32Array[] | null> {
+ if (expectedCount === 0) return [];
+ const root = requireWorkspaceRoot();
+ try {
+ const buffer = await readArrayBuffer(root, captionImageEmbeddingsPath(mediaId));
+ if (!buffer) return null;
+ const expectedFloats = expectedCount * embeddingDim;
+ const got = buffer.byteLength / Float32Array.BYTES_PER_ELEMENT;
+ if (got !== expectedFloats) {
+ logger.warn(
+ `getCaptionImageEmbeddings(${mediaId}): bin has ${got} floats, expected ${expectedFloats} — treating as stale`,
+ );
+ return null;
+ }
+ const packed = new Float32Array(buffer);
+ const vectors: Float32Array[] = [];
+ for (let i = 0; i < expectedCount; i += 1) {
+ vectors.push(packed.slice(i * embeddingDim, (i + 1) * embeddingDim));
+ }
+ return vectors;
+ } catch (error) {
+ logger.warn(`getCaptionImageEmbeddings(${mediaId}) failed`, error);
+ return null;
+ }
+}
+
+export async function deleteCaptions(mediaId: string): Promise<void> {
+ try {
+ await deleteAiOutput(mediaId, 'captions');
+ await deleteCaptionThumbnails(mediaId);
+ await deleteCaptionEmbeddings(mediaId);
+ } catch (error) {
+ logger.error(`deleteCaptions(${mediaId}) failed`, error);
+ throw new Error(`Failed to delete captions: ${mediaId}`);
+ }
+}
+
+/**
+ * Persist a single caption thumbnail JPEG. Returns the workspace-relative
+ * path to stash on the corresponding `MediaCaption.thumbRelPath` so the
+ * Scene Browser can load the blob back on demand.
+ */
+export async function saveCaptionThumbnail(
+ mediaId: string,
+ index: number,
+ blob: Blob,
+): Promise<string> {
+ const root = requireWorkspaceRoot();
+ try {
+ await writeBlob(root, captionThumbPath(mediaId, index), blob);
+ return captionThumbRelPath(mediaId, index);
+ } catch (error) {
+ logger.error(`saveCaptionThumbnail(${mediaId}, ${index}) failed`, error);
+ throw new Error(`Failed to save caption thumbnail: ${mediaId}#${index}`);
+ }
+}
+
+/**
+ * Load a previously-saved caption thumbnail by its workspace-relative path.
+ * Returns `null` when the file is missing (captions from before the feature
+ * landed, or the directory was pruned).
+ */
+export async function getCaptionThumbnailBlob(
+ relPath: string,
+): Promise<Blob | null> {
+ const root = requireWorkspaceRoot();
+ const segments = relPath.split('/').filter(Boolean);
+ if (segments.length === 0) return null;
+ try {
+ return await readBlob(root, segments);
+ } catch (error) {
+ logger.warn(`getCaptionThumbnailBlob(${relPath}) failed`, error);
+ return null;
+ }
+}
+
+/**
+ * Probe the conventional caption thumbnail path for a (mediaId, captionIndex)
+ * pair.
Returns the workspace-relative path when the file exists so the
+ * caller can reuse it without regenerating — useful for captions whose
+ * `thumbRelPath` pointer was dropped across a reload but whose JPEG is
+ * still on disk.
+ */
+export async function probeCaptionThumbnail(
+ mediaId: string,
+ captionIndex: number,
+): Promise<string | null> {
+ const relPath = captionThumbRelPath(mediaId, captionIndex);
+ const blob = await getCaptionThumbnailBlob(relPath);
+ return blob ? relPath : null;
+}
+
+/**
+ * Remove the `captions-thumbs` directory for a media item. No-op when the
+ * directory is absent; never throws — thumbnail cleanup is opportunistic.
+ */
+export async function deleteCaptionThumbnails(mediaId: string): Promise<void> {
+ const root = requireWorkspaceRoot();
+ try {
+ await removeEntry(root, captionThumbsDir(mediaId), { recursive: true });
+ } catch (error) {
+ logger.warn(`deleteCaptionThumbnails(${mediaId}) failed`, error);
+ }
+}
diff --git a/src/infrastructure/storage/workspace-fs/paths.ts b/src/infrastructure/storage/workspace-fs/paths.ts
index 0f2553ca5..f60584d9e 100644
--- a/src/infrastructure/storage/workspace-fs/paths.ts
+++ b/src/infrastructure/storage/workspace-fs/paths.ts
@@ -28,7 +28,11 @@
 * │ ├── waveform/{meta.json,bin-N.bin}
 * │ ├── gif-frames/{meta.json,frame-N.png}
 * │ ├── decoded-audio/{meta.json,left-N.bin,right-N.bin}
- * │ └── transcript.json
+ * │ └── ai/
+ * │ ├── transcript.json
+ * │ ├── captions.json
+ * │ ├── scenes.json
+ * │ └── {kind}.json # new AI outputs go here, one file per kind
 * └── content/
 * └── {hash[0:2]}/{hash}/
 * ├── refs.json
@@ -67,11 +71,18 @@ export const MEDIA_THUMBNAIL_FILENAME = 'thumbnail.jpg';
export const MEDIA_SOURCE_LINK_FILENAME = 'source.link.json';
export const MEDIA_CACHE_DIR = 'cache';

-export const CACHE_FILMSTRIP_DIR = 'filmstrip';
export const CACHE_WAVEFORM_DIR = 'waveform';
export const CACHE_GIF_FRAMES_DIR = 'gif-frames';
export const CACHE_DECODED_AUDIO_DIR = 'decoded-audio';
-export const CACHE_TRANSCRIPT_FILENAME = 'transcript.json';
+export const CACHE_AI_DIR = 'ai';
+/** Per-caption thumbnail JPEGs captured alongside LFM caption generation. */
+export const CACHE_CAPTION_THUMBS_DIR = 'captions-thumbs';
+/**
+ * Legacy path for transcripts — was `cache/transcript.json` before AI outputs
+ * were consolidated under `cache/ai/`. Readers fall back to this on miss; a
+ * subsequent save rewrites to the new path.
+ */
+export const CACHE_TRANSCRIPT_FILENAME_LEGACY = 'transcript.json';
export const CACHE_META_FILENAME = 'meta.json';

export const CONTENT_REFS_FILENAME = 'refs.json';
@@ -180,14 +191,6 @@ export function mediaCacheDir(id: string): string[] {
return [...mediaDir(id), MEDIA_CACHE_DIR];
}

-export function filmstripDir(mediaId: string): string[] {
- return [...mediaCacheDir(mediaId), CACHE_FILMSTRIP_DIR];
-}
-
-export function filmstripFramePath(mediaId: string, frameIndex: number): string[] {
- return [...filmstripDir(mediaId), `frame-${frameIndex}.jpg`];
-}
-
export function waveformDir(mediaId: string): string[] {
return [...mediaCacheDir(mediaId), CACHE_WAVEFORM_DIR];
}
@@ -216,8 +219,65 @@ export function decodedAudioBinPath(
return [...decodedAudioDir(mediaId), `${channel}-${binIndex}.bin`];
}

-export function transcriptPath(mediaId: string): string[] {
- return [...mediaCacheDir(mediaId), CACHE_TRANSCRIPT_FILENAME];
+/**
+ * Segments for `media/{id}/cache/ai/` — home for AI-derived analysis outputs
+ * (transcripts, captions, scene cuts, etc.). One file per `AiOutputKind`.
+ */ +export function aiOutputsDir(mediaId: string): string[] { + return [...mediaCacheDir(mediaId), CACHE_AI_DIR]; +} + +/** + * Segments for `media/{id}/cache/ai/{kind}.json`. The caller owns the `kind` + * enum (see `ai-outputs/types.ts`) — this helper only does path assembly. + */ +export function aiOutputPath(mediaId: string, kind: string): string[] { + return [...aiOutputsDir(mediaId), `${kind}.json`]; +} + +/** Segments for `media/{id}/cache/ai/captions-thumbs/`. */ +export function captionThumbsDir(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), CACHE_CAPTION_THUMBS_DIR]; +} + +/** Segments for `media/{id}/cache/ai/captions-thumbs/{index}.jpg`. */ +export function captionThumbPath(mediaId: string, index: number): string[] { + return [...captionThumbsDir(mediaId), `${index}.jpg`]; +} + +/** + * Segments for `media/{id}/cache/ai/captions-embeddings.bin`. Stored as a + * contiguous `Float32Array` so 384-dim * N-caption embeddings stay compact + * (e.g. 500 captions = 750 KB vs ~4 MB if round-tripped through JSON). + */ +export function captionEmbeddingsPath(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), 'captions-embeddings.bin']; +} + +/** + * Segments for `media/{id}/cache/ai/captions-image-embeddings.bin`. Same + * packing as the text embeddings bin but in the CLIP joint embedding + * space (typically 512-dim), so semantic queries can fall back to + * matching on what the clip *looks like* when caption text is thin. + */ +export function captionImageEmbeddingsPath(mediaId: string): string[] { + return [...aiOutputsDir(mediaId), 'captions-image-embeddings.bin']; +} + +/** + * Workspace-root-relative path (forward-slash separated) for a caption thumb, + * safe to persist in JSON / `MediaCaption.thumbRelPath`. + */ +export function captionThumbRelPath(mediaId: string, index: number): string { + return captionThumbPath(mediaId, index).join('/'); +} + +/** + * Legacy path kept only for read-fallback. New writes go through + * `aiOutputPath(mediaId, 'transcript')`. + */ +export function legacyTranscriptPath(mediaId: string): string[] { + return [...mediaCacheDir(mediaId), CACHE_TRANSCRIPT_FILENAME_LEGACY]; } export function cacheMetaPath(dir: string[]): string[] { @@ -239,10 +299,11 @@ export function contentDataPath(hash: string, extension: string): string[] { return [...contentDir(hash), `data.${ext}`]; } -/* ───────────────── Mirrored OPFS caches (shared across origins) ─────────────── */ +/* ---------------- Shared persisted caches ---------------- */ // -// These caches are primary in OPFS for speed but are also mirrored into the -// workspace folder so other origins can read them without regenerating. +// These caches live outside the media metadata tree so other origins can reuse +// them without regenerating. Some still hydrate from legacy OPFS on demand. +// Filmstrips intentionally use top-level `filmstrips/{mediaId}/...`. export const WORKSPACE_PROXIES_DIR = 'proxies'; export const WORKSPACE_FILMSTRIPS_DIR = 'filmstrips'; @@ -266,13 +327,16 @@ export function filmstripMetaPath(mediaId: string): string[] { } export function previewAudioPath(relativePath: string): string[] { - // relativePath like 'm-123/track-left.wav' — keep original OPFS layout. - return [WORKSPACE_PREVIEW_AUDIO_DIR, ...relativePath.split('/')]; + // Legacy metadata may already include the top-level 'preview-audio/' prefix. + // Normalize that away so workspace paths stay rooted at one shared folder. 
+ const normalized = relativePath.replace(/^preview-audio\//, '');
+ return [WORKSPACE_PREVIEW_AUDIO_DIR, ...normalized.split('/').filter(Boolean)];
}

/**
- * Fast multi-resolution waveform binary — the OPFS-primary cache used by the
- * timeline renderer, mirrored here for cross-origin reuse. Different from
+ * Fast multi-resolution waveform binary. The timeline renderer still keeps an
+ * OPFS-local copy for range reads, while the workspace mirror enables
+ * cross-origin reuse. Different from
 * `waveformBinPath` above, which addresses bins inside the per-media cache.
 */
export function waveformBinaryPath(mediaId: string): string[] {
diff --git a/src/infrastructure/storage/workspace-fs/scenes.ts b/src/infrastructure/storage/workspace-fs/scenes.ts
new file mode 100644
index 000000000..3548fb3af
--- /dev/null
+++ b/src/infrastructure/storage/workspace-fs/scenes.ts
@@ -0,0 +1,109 @@
+/**
+ * Per-media scene-detection results.
+ *
+ * Stored at `media/{mediaId}/cache/ai/scenes.json` as an {@link AiOutput}
+ * envelope. Scene cuts are a property of the source media (not the timeline
+ * clip), so caching by `mediaId` survives trim/split edits.
+ *
+ * Detection parameters (method, sample interval, verification model) are
+ * persisted in the envelope so consumers can skip the expensive recompute
+ * when the requested parameters match, and re-run when they don't.
+ */
+
+import type { SceneCut } from '@/infrastructure/analysis';
+import { createLogger } from '@/shared/logging/logger';
+
+import { readAiOutput, writeAiOutput, deleteAiOutput } from './ai-outputs';
+import type { ScenesPayload, SceneCutPayload } from './ai-outputs';
+
+const logger = createLogger('WorkspaceFS:Scenes');
+
+export interface SavedScenes {
+ method: 'histogram' | 'optical-flow';
+ sampleIntervalMs: number;
+ verificationModel?: string;
+ fps: number;
+ cuts: SceneCut[];
+}
+
+interface SaveScenesInput extends SavedScenes {
+ mediaId: string;
+ /** Stable provider id (e.g. `"scene-detect-histogram"`, `"scene-detect-optical-flow"`). */
+ service: string;
+ /** Detector/model identifier — for histogram this is just `"histogram"`. */
+ model: string;
+}
+
+function cutsToPayload(cuts: SceneCut[]): SceneCutPayload[] {
+ return cuts.map((cut) => ({
+ frame: cut.frame,
+ time: cut.time,
+ motion: cut.motion,
+ verified: cut.verified,
+ }));
+}
+
+function payloadToCuts(cuts: SceneCutPayload[]): SceneCut[] {
+ return cuts as unknown as SceneCut[];
+}
+
+export async function getScenes(mediaId: string): Promise<SavedScenes | undefined> {
+ try {
+ const envelope = await readAiOutput(mediaId, 'scenes');
+ if (!envelope) return undefined;
+ const data: ScenesPayload = envelope.data;
+ return {
+ method: data.method,
+ sampleIntervalMs: data.sampleIntervalMs,
+ verificationModel: data.verificationModel,
+ fps: data.fps,
+ cuts: payloadToCuts(data.cuts),
+ };
+ } catch (error) {
+ logger.error(`getScenes(${mediaId}) failed`, error);
+ throw new Error(`Failed to load scenes: ${mediaId}`);
+ }
+}
+
+export async function saveScenes(input: SaveScenesInput): Promise<SavedScenes> {
+ try {
+ const payload: ScenesPayload = {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel,
+ fps: input.fps,
+ cuts: cutsToPayload(input.cuts),
+ };
+ await writeAiOutput({
+ mediaId: input.mediaId,
+ kind: 'scenes',
+ service: input.service,
+ model: input.model,
+ params: {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel ??
null,
+ },
+ data: payload,
+ });
+ return {
+ method: input.method,
+ sampleIntervalMs: input.sampleIntervalMs,
+ verificationModel: input.verificationModel,
+ fps: input.fps,
+ cuts: input.cuts,
+ };
+ } catch (error) {
+ logger.error(`saveScenes(${input.mediaId}) failed`, error);
+ throw new Error(`Failed to save scenes: ${input.mediaId}`);
+ }
+}
+
+export async function deleteScenes(mediaId: string): Promise<void> {
+ try {
+ await deleteAiOutput(mediaId, 'scenes');
+ } catch (error) {
+ logger.error(`deleteScenes(${mediaId}) failed`, error);
+ throw new Error(`Failed to delete scenes: ${mediaId}`);
+ }
+}
diff --git a/src/infrastructure/storage/workspace-fs/transcripts.test.ts b/src/infrastructure/storage/workspace-fs/transcripts.test.ts
index bf9a358f5..9c8ecd501 100644
--- a/src/infrastructure/storage/workspace-fs/transcripts.test.ts
+++ b/src/infrastructure/storage/workspace-fs/transcripts.test.ts
@@ -23,6 +23,8 @@ import {
} from './transcripts';
import { setWorkspaceRoot } from './root';
import { asHandle, createRoot } from './__tests__/in-memory-handle';
+import { writeJsonAtomic } from './fs-primitives';
+import { legacyTranscriptPath, aiOutputPath } from './paths';

function makeTranscript(mediaId: string): MediaTranscript {
return {
@@ -75,4 +77,33 @@
await deleteTranscript('m1');
expect(await getTranscript('m1')).toBeUndefined();
});
+
+ it('reads a legacy cache/transcript.json written before the ai/ migration', async () => {
+ const root = createRoot();
+ setWorkspaceRoot(asHandle(root));
+ await writeJsonAtomic(asHandle(root), legacyTranscriptPath('legacy-id'), makeTranscript('legacy-id'));
+
+ const loaded = await getTranscript('legacy-id');
+ expect(loaded?.mediaId).toBe('legacy-id');
+ expect(loaded?.segments[0]?.text).toBe('hello');
+ });
+
+ it('saveTranscript migrates legacy path to ai/ envelope', async () => {
+ const root = createRoot();
+ setWorkspaceRoot(asHandle(root));
+ await writeJsonAtomic(asHandle(root), legacyTranscriptPath('m2'), makeTranscript('m2'));
+
+ // Round-trip through save rewrites to the new path.
+ await saveTranscript(makeTranscript('m2'));
+
+ // Allow the fire-and-forget legacy cleanup to settle.
+ await Promise.resolve();
+
+ const { readJson } = await import('./fs-primitives');
+ const legacy = await readJson(asHandle(root), legacyTranscriptPath('m2'));
+ expect(legacy).toBeNull();
+
+ const envelope = await readJson(asHandle(root), aiOutputPath('m2', 'transcript'));
+ expect(envelope).toBeTruthy();
+ });
});
diff --git a/src/infrastructure/storage/workspace-fs/transcripts.ts b/src/infrastructure/storage/workspace-fs/transcripts.ts
index 5234b8dd1..aad1685c3 100644
--- a/src/infrastructure/storage/workspace-fs/transcripts.ts
+++ b/src/infrastructure/storage/workspace-fs/transcripts.ts
@@ -1,30 +1,47 @@
/**
* Per-media transcripts backed by the workspace folder.
*
- * Stored at `media/{mediaId}/cache/transcript.json`. Pure JSON record —
- * no binary data or handles involved.
+ * Persisted at `media/{mediaId}/cache/ai/transcript.json` as an
+ * {@link AiOutput} envelope. Reads fall back to the legacy
+ * `cache/transcript.json` path; the next save rewrites to the new location,
+ * so the migration is invisible to callers.
+ *
+ * The public API still exposes {@link MediaTranscript} — the flat record
+ * shape predates the envelope and is what the UI and indexers consume.
*/

import type { MediaTranscript } from '@/types/storage';
import { createLogger } from '@/shared/logging/logger';

import { requireWorkspaceRoot } from './root';
+import { readJson, removeEntry } from './fs-primitives';
+import { legacyTranscriptPath } from './paths';
import {
- readJson,
- removeEntry,
- writeJsonAtomic,
-} from './fs-primitives';
-import { transcriptPath } from './paths';
+ readAiOutput,
+ writeAiOutput,
+ deleteAiOutput,
+ getMediaIdsWithAiOutput,
+ transcriptFromLegacy,
+ transcriptToLegacy,
+} from './ai-outputs';

const logger = createLogger('WorkspaceFS:Transcripts');

+async function readLegacyTranscript(mediaId: string): Promise<MediaTranscript | undefined> {
+ const root = requireWorkspaceRoot();
+ const legacy = await readJson<MediaTranscript>(root, legacyTranscriptPath(mediaId));
+ return legacy ?? undefined;
+}
+
export async function getTranscript(
mediaId: string,
): Promise<MediaTranscript | undefined> {
- const root = requireWorkspaceRoot();
try {
- const transcript = await readJson<MediaTranscript>(root, transcriptPath(mediaId));
- return transcript ?? undefined;
+ const envelope = await readAiOutput(mediaId, 'transcript');
+ if (envelope) return transcriptToLegacy(envelope);
+
+ const legacy = await readLegacyTranscript(mediaId);
+ return legacy ?? undefined;
} catch (error) {
logger.error(`getTranscript(${mediaId}) failed`, error);
throw new Error(`Failed to load transcript: ${mediaId}`);
@@ -35,18 +52,17 @@ export async function getTranscriptMediaIds(
mediaIds: string[],
): Promise<Set<string>> {
if (mediaIds.length === 0) return new Set();
- const root = requireWorkspaceRoot();
try {
- const ready = new Set<string>();
- const results = await Promise.all(
- mediaIds.map(async (id) => {
- const t = await readJson<MediaTranscript>(root, transcriptPath(id));
- return t ?? null;
- }),
- );
- results.forEach((r) => {
- if (r?.mediaId) ready.add(r.mediaId);
- });
+ const ready = await getMediaIdsWithAiOutput(mediaIds, 'transcript');
+ const missing = mediaIds.filter((id) => !ready.has(id));
+ if (missing.length > 0) {
+ const legacyResults = await Promise.all(
+ missing.map(async (id) => ((await readLegacyTranscript(id)) ? id : null)),
+ );
+ for (const id of legacyResults) {
+ if (id) ready.add(id);
+ }
+ }
return ready;
} catch (error) {
logger.error('getTranscriptMediaIds failed', error);
@@ -57,10 +73,24 @@ export async function saveTranscript(
transcript: MediaTranscript,
): Promise<MediaTranscript> {
- const root = requireWorkspaceRoot();
try {
- await writeJsonAtomic(root, transcriptPath(transcript.mediaId), transcript);
- return transcript;
+ const envelope = transcriptFromLegacy(transcript);
+ const written = await writeAiOutput({
+ mediaId: envelope.mediaId,
+ kind: 'transcript',
+ service: envelope.service,
+ model: envelope.model,
+ params: envelope.params,
+ data: envelope.data,
+ });
+
+ // Fire-and-forget legacy-path cleanup on successful migration.
+ const root = requireWorkspaceRoot();
+ void removeEntry(root, legacyTranscriptPath(transcript.mediaId)).catch(
+ (error) => logger.warn(`legacy transcript cleanup failed for ${transcript.mediaId}`, error),
+ );
+
+ return transcriptToLegacy(written);
} catch (error) {
logger.error(`saveTranscript(${transcript.mediaId}) failed`, error);
throw new Error(`Failed to save transcript: ${transcript.mediaId}`);
@@ -68,9 +98,10 @@ }

export async function deleteTranscript(mediaId: string): Promise<void> {
- const root = requireWorkspaceRoot();
try {
- await removeEntry(root, transcriptPath(mediaId));
+ await deleteAiOutput(mediaId, 'transcript');
+ const root = requireWorkspaceRoot();
+ await removeEntry(root, legacyTranscriptPath(mediaId));
} catch (error) {
logger.error(`deleteTranscript(${mediaId}) failed`, error);
throw new Error(`Failed to delete transcript: ${mediaId}`);
diff --git a/src/lib/analysis/captioning/lfm-captioning-provider.ts b/src/lib/analysis/captioning/lfm-captioning-provider.ts
index a3800825b..3b5bf2dc8 100644
--- a/src/lib/analysis/captioning/lfm-captioning-provider.ts
+++ b/src/lib/analysis/captioning/lfm-captioning-provider.ts
@@ -99,8 +99,13 @@ function waitForReady(
});
}

-function captionSingle(worker: Worker, id: number, imageBlob: Blob, signal?: AbortSignal): Promise<string> {
- return new Promise<string>((resolve, reject) => {
+function captionSingle(
+ worker: Worker,
+ id: number,
+ imageBlob: Blob,
+ signal?: AbortSignal,
+): Promise<ReturnType<typeof parseSceneCaptionResponse>> {
+ return new Promise<ReturnType<typeof parseSceneCaptionResponse>>((resolve, reject) => {
const onAbort = () => {
cleanup();
reject(signal!.reason);
};
@@ -115,7 +120,10 @@
const onMessage = (event: MessageEvent) => {
if (event.data.type === 'caption' && event.data.id === id) {
cleanup();
- resolve(event.data.caption ?? '');
+ resolve({
+ text: event.data.caption ?? '',
+ sceneData: event.data.sceneData,
+ });
}
};
@@ -145,6 +153,7 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = {
onProgress,
signal,
sampleIntervalSec: rawSampleInterval = DEFAULT_SAMPLE_INTERVAL_SEC,
+ saveThumbnail,
} = options;
const sampleIntervalSec = Number.isFinite(rawSampleInterval) && rawSampleInterval > 0
? rawSampleInterval
@@ -191,15 +200,25 @@
totalFrames: timestamps.length,
});

- const text = await captionSingle(worker, index, blob, signal);
- if (text) {
+ const result = await captionSingle(worker, index, blob, signal);
+ if (result.text) {
+ let thumbRelPath: string | undefined;
+ if (saveThumbnail) {
+ try {
+ thumbRelPath = await saveThumbnail(index, blob);
+ } catch (error) {
+ log.warn('Caption thumbnail persist failed — skipping', { index, error });
+ }
+ }
captions.push({
timeSec: Math.round(timeSec * 10) / 10,
- text,
+ text: result.text,
+ ...(result.sceneData ? { sceneData: result.sceneData } : {}),
+ ...(thumbRelPath ?
{ thumbRelPath } : {}), }); } - log.info('Frame caption', { frame: index, time: timeSec.toFixed(1), length: text.length }); + log.info('Frame caption', { frame: index, time: timeSec.toFixed(1), length: result.text.length }); } return captions; @@ -225,7 +244,7 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = { totalFrames: 1, }); - const text = await captionSingle(worker, 0, imageBlob, signal); + const result = await captionSingle(worker, 0, imageBlob, signal); onProgress?.({ stage: 'captioning', @@ -234,8 +253,14 @@ export const lfmCaptioningProvider: MediaCaptioningProvider = { totalFrames: 1, }); - log.info('Image caption', { length: text.length }); - return text ? [{ timeSec: 0, text }] : []; + log.info('Image caption', { length: result.text.length }); + return result.text + ? [{ + timeSec: 0, + text: result.text, + ...(result.sceneData ? { sceneData: result.sceneData } : {}), + }] + : []; } finally { worker.postMessage({ type: 'dispose' }); setTimeout(() => worker.terminate(), 500); diff --git a/src/lib/analysis/captioning/scene-caption-format.test.ts b/src/lib/analysis/captioning/scene-caption-format.test.ts new file mode 100644 index 000000000..72935f897 --- /dev/null +++ b/src/lib/analysis/captioning/scene-caption-format.test.ts @@ -0,0 +1,137 @@ +import { describe, expect, it } from 'vitest'; +import { + formatSceneCaption, + formatSceneCaptionFromData, + LFM_SCENE_CAPTION_PROMPT, + normalizeShotVocabulary, + normalizeSceneCaptionData, + parseSceneCaptionResponse, +} from './scene-caption-format'; + +describe('LFM_SCENE_CAPTION_PROMPT', () => { + it('asks for JSON only with structured scene fields', () => { + expect(LFM_SCENE_CAPTION_PROMPT).toContain('return a valid JSON object only'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('"caption": string'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('"shotType": string | null'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Use null for missing scalar fields and [] for missing subjects'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('The first character of the response must be { and the last character must be }'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Use double quotes around every key and every string value'); + expect(LFM_SCENE_CAPTION_PROMPT).toContain('Do not mention camera motion'); + }); +}); + +describe('normalizeShotVocabulary', () => { + it('normalizes common shot-term spelling and hyphenation inside prose', () => { + expect(normalizeShotVocabulary('A medium close up of a singer')).toBe('A medium close-up of a singer'); + expect(normalizeShotVocabulary('An extreme closeup of an eye')).toBe('An extreme close-up of an eye'); + expect(normalizeShotVocabulary('A medium wide shot of a street')).toBe('A medium-wide shot of a street'); + }); +}); + +describe('normalizeSceneCaptionData', () => { + it('canonicalizes shotType aliases and strips empty fields', () => { + expect(normalizeSceneCaptionData({ + caption: 'A singer under stage lights.', + shot_type: 'medium close up', + subjects: ['singer', ' ', 'microphone'], + weather: 'unknown', + })).toEqual({ + caption: 'A singer under stage lights.', + shotType: 'medium close-up', + subjects: ['singer', 'microphone'], + }); + }); +}); + +describe('formatSceneCaption', () => { + it('strips lead-ins and standardizes leading shot phrasing', () => { + expect(formatSceneCaption('This image shows a medium wide shot of a woman in a cafe')).toBe( + 'Medium-wide shot of a woman in a cafe.', + ); + }); + + it('collapses multi-sentence output to one sentence', () => { + 
expect(formatSceneCaption('Wide shot of two people crossing a city street. Rain falls in the distance.')).toBe( + 'Wide shot of two people crossing a city street.', + ); + }); + + it('drops uncertain time-of-day or weather clauses instead of persisting guesses', () => { + expect(formatSceneCaption('Close up of a woman indoors, possibly at dusk')).toBe( + 'Close-up of a woman indoors.', + ); + expect(formatSceneCaption('A wide shot of a street, maybe rainy')).toBe( + 'Wide shot of a street.', + ); + }); +}); + +describe('formatSceneCaptionFromData', () => { + it('builds a readable fallback sentence from structured fields', () => { + expect(formatSceneCaptionFromData({ + shotType: 'wide shot', + subjects: ['two people'], + action: 'walking across the street', + setting: 'city street', + timeOfDay: 'dusk', + weather: 'rainy', + })).toBe('Wide shot of two people walking across the street in city street in rainy weather at dusk.'); + }); +}); + +describe('parseSceneCaptionResponse', () => { + it('parses JSON responses and preserves structured scene data', () => { + expect(parseSceneCaptionResponse( + '{"caption":"A woman in a red coat walks through a rainy city street at dusk.","shotType":"wide shot","subjects":["woman"],"action":"walking through the street","setting":"city street","lighting":"dim evening light","timeOfDay":"dusk","weather":"rainy"}', + )).toEqual({ + text: 'A woman in a red coat walks through a rainy city street at dusk.', + sceneData: { + caption: 'A woman in a red coat walks through a rainy city street at dusk.', + shotType: 'wide shot', + subjects: ['woman'], + action: 'walking through the street', + setting: 'city street', + lighting: 'dim evening light', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + }); + + it('accepts fenced JSON and falls back to the structured fields when caption is missing', () => { + expect(parseSceneCaptionResponse( + '```json\n{"shotType":"medium close up","subjects":["singer"],"action":"singing into a microphone","setting":"stage","timeOfDay":null,"weather":null}\n```', + )).toEqual({ + text: 'Medium close-up of singer singing into a microphone in stage.', + sceneData: { + caption: 'Medium close-up of singer singing into a microphone in stage.', + shotType: 'medium close-up', + subjects: ['singer'], + action: 'singing into a microphone', + setting: 'stage', + }, + }); + }); + + it('falls back to freeform text formatting when JSON parsing fails', () => { + expect(parseSceneCaptionResponse('This image shows a close up of a hand holding a glass')).toEqual({ + text: 'Close-up of a hand holding a glass.', + }); + }); + + it('recovers known fields from json-ish output when strict parsing fails', () => { + expect(parseSceneCaptionResponse( + 'Json ["caption":"A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.","shotType":"medium wide shot","subjects":["lanterns","corridor","text"],"action":"glowing softly","setting":"interior corridor","lighting":"golden lantern light","timeOfDay":null,"weather":null}.', + )).toEqual({ + text: 'A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.', + sceneData: { + caption: 'A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.', + shotType: 'medium-wide shot', + subjects: ['lanterns', 'corridor', 'text'], + action: 'glowing softly', + setting: 'interior corridor', + lighting: 'golden lantern light', + }, + }); + }); +}); diff --git 
a/src/lib/analysis/captioning/scene-caption-format.ts b/src/lib/analysis/captioning/scene-caption-format.ts new file mode 100644 index 000000000..5b50fed5f --- /dev/null +++ b/src/lib/analysis/captioning/scene-caption-format.ts @@ -0,0 +1,428 @@ +import type { SceneCaptionData } from './types'; + +export const CANONICAL_SHOT_SIZES = [ + 'extreme wide shot', + 'wide shot', + 'medium-wide shot', + 'medium shot', + 'medium close-up', + 'close-up', + 'extreme close-up', +] as const; + +export const LFM_SCENE_CAPTION_PROMPT = + 'Analyze this single video frame and return a valid JSON object only.\n\n' + + 'Use this exact schema:\n' + + '{' + + '"caption": string, ' + + '"shotType": string | null, ' + + '"subjects": string[], ' + + '"action": string | null, ' + + '"setting": string | null, ' + + '"lighting": string | null, ' + + '"timeOfDay": string | null, ' + + '"weather": string | null' + + '}\n\n' + + 'Rules:\n' + + '- "caption" must be one detailed natural sentence.\n' + + '- Describe the visible subject, action, setting, lighting, time of day, and weather when clearly visible.\n' + + `- "shotType" is optional and must be one of: ${CANONICAL_SHOT_SIZES.join(', ')}.\n` + + '- If shot size is not unmistakable, use null.\n' + + '- If time of day or weather is unclear, use null.\n' + + '- Use null for missing scalar fields and [] for missing subjects.\n' + + '- The first character of the response must be { and the last character must be }.\n' + + '- Use double quotes around every key and every string value.\n' + + '- Do not mention camera motion, camera movement, editing, or uncertainty.\n' + + '- Do not wrap the JSON in markdown fences or prose.'; + +const LABEL_PREFIX_PATTERN = /^(?:caption|scene|description)\s*:\s*/i; +const JSON_LEAD_IN_PATTERN = /^(?:json(?:\s+(?:object|response))?|response|output)\s*[:-]?\s*/i; +const SHOT_LABEL_PREFIX_PATTERN = /^shot(?:\s+type)?\s*:\s*/i; +const LEAD_IN_PATTERNS = [ + /^(?:this|the)\s+(?:image|frame|scene|shot)\s+(?:shows|depicts|features)\s+/i, + /^(?:we can see|we see)\s+/i, +] as const; +const SHOT_ONLY_PATTERN = + /^(?:shot(?:\s+type)?\s*:\s*)?(?:extreme wide shot|wide shot|medium-wide shot|medium shot|medium close-up|close-up|extreme close-up)$/i; +const UNCERTAIN_ENVIRONMENT_TAIL_PATTERN = + /(?:,\s*|\s+-\s+|\s+)(?:possibly|maybe|perhaps|likely|apparently|seemingly|it\s+seems\s+to\s+be|it\s+appears\s+to\s+be|appears\s+to\s+be|seems\s+to\s+be)\s+(?:at\s+)?(?:sunrise|dawn|morning|day(?:time)?|afternoon|golden\s+hour|sunset|dusk|night(?:time)?|rain(?:y|ing)?|snow(?:y|ing)?|fog(?:gy)?|mist(?:y)?|overcast|cloudy|sunny|storm(?:y)?|clear(?:\s+sk(?:y|ies))?)\b[^.?!,;:]*$/i; +const EMPTY_FIELD_PATTERN = /^(?:null|none|n\/a|unknown|unclear|not visible|not obvious)$/i; +const QUOTE_WRAPPER_PATTERN = /^"(.*)"$/s; + +function normalizeWhitespace(text: string): string { + return text.replace(/\s+/g, ' ').trim(); +} + +function stripOuterQuotes(text: string): string { + return text.replace(/^[`"']+|[`"']+$/g, ''); +} + +function stripLeadIns(text: string): string { + let next = text.trim(); + next = next.replace(/^[\s\-*]+/, ''); + next = next.replace(JSON_LEAD_IN_PATTERN, ''); + next = next.replace(LABEL_PREFIX_PATTERN, ''); + + for (const pattern of LEAD_IN_PATTERNS) { + next = next.replace(pattern, ''); + } + + return next.trim(); +} + +function stripTerminalPunctuation(text: string): string { + return text.replace(/[.!?]+$/u, '').trim(); +} + +function lowerCaseFirst(text: string): string { + if (text.length === 0) return text; + return 
text.charAt(0).toLowerCase() + text.slice(1); +} + +function upperCaseFirst(text: string): string { + if (text.length === 0) return text; + return text.charAt(0).toUpperCase() + text.slice(1); +} + +export function normalizeShotVocabulary(text: string): string { + let next = text; + + const replacements: Array<[RegExp, string]> = [ + [/\bextreme[\s-]+long shot\b/gi, 'extreme wide shot'], + [/\bextreme[\s-]+wide shot\b/gi, 'extreme wide shot'], + [/\bmedium[\s-]+wide shot\b/gi, 'medium-wide shot'], + [/\bmedium[\s-]+close[\s-]*up\b/gi, 'medium close-up'], + [/\bmedium[\s-]+close shot\b/gi, 'medium close-up'], + [/\bextreme[\s-]+close[\s-]*up\b/gi, 'extreme close-up'], + [/\bclose[\s-]*up\b/gi, 'close-up'], + [/\blong shot\b/gi, 'wide shot'], + [/\bwide shot\b/gi, 'wide shot'], + [/\bmedium shot\b/gi, 'medium shot'], + ]; + + for (const [pattern, replacement] of replacements) { + next = next.replace(pattern, replacement); + } + + return next; +} + +function collapseToSingleSentence(text: string): string { + const fragments = text + .split(/(?:\r?\n)+|(?<=[.!?])\s+|;\s+/u) + .map((fragment) => normalizeWhitespace(stripOuterQuotes(fragment))) + .filter(Boolean); + + if (fragments.length === 0) return ''; + if (fragments.length === 1) return fragments[0]!; + + const first = normalizeShotVocabulary(stripTerminalPunctuation(fragments[0]!)); + if (SHOT_ONLY_PATTERN.test(first)) { + const shot = stripTerminalPunctuation(first.replace(SHOT_LABEL_PREFIX_PATTERN, '')).toLowerCase(); + const followUp = stripTerminalPunctuation(stripLeadIns(fragments[1]!)); + if (followUp.length > 0) { + return `${shot} in which ${lowerCaseFirst(followUp)}`; + } + } + + return fragments[0]!; +} + +function stripUncertainEnvironmentTail(text: string): string { + return text.replace(UNCERTAIN_ENVIRONMENT_TAIL_PATTERN, ''); +} + +function stripLeadingShotArticle(text: string): string { + return text.replace( + /^(?:a|an)\s+(extreme wide shot|wide shot|medium-wide shot|medium shot|medium close-up|close-up|extreme close-up)\b/i, + (_, shot: string) => shot.toLowerCase(), + ); +} + +function sanitizeScalar(value: unknown): string | undefined { + if (typeof value !== 'string') return undefined; + const normalized = normalizeWhitespace(stripOuterQuotes(value)); + if (normalized.length === 0 || EMPTY_FIELD_PATTERN.test(normalized)) return undefined; + return normalized; +} + +function sanitizeSubjects(value: unknown): string[] | undefined { + if (!Array.isArray(value)) return undefined; + const subjects = value + .map((entry) => sanitizeScalar(entry)) + .filter((entry): entry is string => Boolean(entry)); + return subjects.length > 0 ? 
subjects : undefined; +} + +function normalizeShotType(value: unknown): string | undefined { + const scalar = sanitizeScalar(value); + if (!scalar) return undefined; + const compact = stripTerminalPunctuation(scalar).toLowerCase(); + const aliasMap: Record<string, string> = { + 'extreme wide': 'extreme wide shot', + 'extreme wide shot': 'extreme wide shot', + 'extreme long shot': 'extreme wide shot', + 'wide': 'wide shot', + 'wide shot': 'wide shot', + 'long shot': 'wide shot', + 'medium wide': 'medium-wide shot', + 'medium wide shot': 'medium-wide shot', + 'medium-wide shot': 'medium-wide shot', + 'medium': 'medium shot', + 'medium shot': 'medium shot', + 'medium close': 'medium close-up', + 'medium close up': 'medium close-up', + 'medium close-up': 'medium close-up', + 'close': 'close-up', + 'close up': 'close-up', + 'close-up': 'close-up', + 'extreme close': 'extreme close-up', + 'extreme close up': 'extreme close-up', + 'extreme close-up': 'extreme close-up', + }; + const normalized = aliasMap[compact] ?? normalizeShotVocabulary(compact).toLowerCase(); + return CANONICAL_SHOT_SIZES.find((shot) => shot === normalized); +} + +function hasStructuredFields(data: SceneCaptionData): boolean { + return Boolean( + data.caption + || data.shotType + || (data.subjects && data.subjects.length > 0) + || data.action + || data.setting + || data.lighting + || data.timeOfDay + || data.weather, + ); +} + +function escapeRegExp(text: string): string { + return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); +} + +function decodeLooseValue(raw: string): string | null | undefined { + const trimmed = raw.trim(); + if (trimmed.length === 0) return undefined; + if (/^null$/i.test(trimmed)) return null; + + if (/^"(?:\\.|[^"])*"$/s.test(trimmed)) { + try { + const parsed = JSON.parse(trimmed) as unknown; + return typeof parsed === 'string' ? parsed : undefined; + } catch { + return stripOuterQuotes(trimmed); + } + } + + if (/^'(?:\\.|[^'])*'$/s.test(trimmed)) { + return stripOuterQuotes(trimmed) + .replace(/\\'/g, '\'') + .replace(/\\"/g, '"') + .replace(/\\\\/g, '\\'); + } + + return trimmed.replace(/[.,;:]+$/u, '').trim(); +} + +function extractLooseScalar(raw: string, keys: string[]): string | null | undefined { + const pattern = new RegExp( + String.raw`(?:["']?(?:${keys.map(escapeRegExp).join('|')})["']?)\s*:\s*(null|"(?:\\.|[^"])*"|'(?:\\.|[^'])*'|[^,\r\n}]+)`, + 'i', + ); + const match = raw.match(pattern); + return match?.[1] ?
decodeLooseValue(match[1]) : undefined; +} + +function extractLooseSubjects(raw: string): string[] | undefined { + const match = raw.match(/(?:["']?subjects["']?)\s*:\s*\[([\s\S]*?)\]/i); + if (!match) return undefined; + + const entries = Array.from(match[1].matchAll(/"(?:\\.|[^"])*"|'(?:\\.|[^'])*'|[^,\]]+/g)) + .map((entry) => decodeLooseValue(entry[0])) + .filter((entry): entry is string => typeof entry === 'string' && entry.length > 0); + + return entries; +} + +function parseLooseJsonObject(raw: string): Record<string, unknown> | null { + const object: Record<string, unknown> = {}; + const normalized = stripLeadIns(raw); + + const caption = extractLooseScalar(normalized, ['caption']); + if (caption !== undefined) object.caption = caption; + + const shotType = extractLooseScalar(normalized, ['shotType', 'shot_type']); + if (shotType !== undefined) object.shotType = shotType; + + const subjects = extractLooseSubjects(normalized); + if (subjects !== undefined) object.subjects = subjects; + + const action = extractLooseScalar(normalized, ['action']); + if (action !== undefined) object.action = action; + + const setting = extractLooseScalar(normalized, ['setting']); + if (setting !== undefined) object.setting = setting; + + const lighting = extractLooseScalar(normalized, ['lighting']); + if (lighting !== undefined) object.lighting = lighting; + + const timeOfDay = extractLooseScalar(normalized, ['timeOfDay', 'time_of_day']); + if (timeOfDay !== undefined) object.timeOfDay = timeOfDay; + + const weather = extractLooseScalar(normalized, ['weather']); + if (weather !== undefined) object.weather = weather; + + return Object.keys(object).length > 0 ? object : null; +} + +function extractJsonCandidate(raw: string): string | null { + const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i); + if (fenced?.[1]) { + return fenced[1].trim(); + } + + const start = raw.indexOf('{'); + const end = raw.lastIndexOf('}'); + if (start >= 0 && end > start) { + return raw.slice(start, end + 1); + } + + return null; +} + +function parseJsonObject(raw: string): Record<string, unknown> | null { + const candidate = extractJsonCandidate(raw); + if (!candidate) return null; + + try { + const parsed = JSON.parse(candidate) as unknown; + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) + ? parsed as Record<string, unknown> + : null; + } catch { + return null; + } +} + +function readField(object: Record<string, unknown>, ...keys: string[]): unknown { + for (const key of keys) { + if (key in object) return object[key]; + } + return undefined; +} + +export function normalizeSceneCaptionData(object: Record<string, unknown>): SceneCaptionData { + const sceneData: SceneCaptionData = { + caption: sanitizeScalar(readField(object, 'caption')), + shotType: normalizeShotType(readField(object, 'shotType', 'shot_type')), + subjects: sanitizeSubjects(readField(object, 'subjects')), + action: sanitizeScalar(readField(object, 'action')), + setting: sanitizeScalar(readField(object, 'setting')), + lighting: sanitizeScalar(readField(object, 'lighting')), + timeOfDay: sanitizeScalar(readField(object, 'timeOfDay', 'time_of_day')), + weather: sanitizeScalar(readField(object, 'weather')), + }; + + return hasStructuredFields(sceneData) ? sceneData : {}; +} + +function maybeWrapWithOf(fragment: string): string { + return /^(?:of|in|on|at)\b/i.test(fragment) ?
fragment : `of ${fragment}`; +} + +export function formatSceneCaption(raw: string): string { + let next = normalizeWhitespace(stripOuterQuotes(raw)); + if (next.length === 0) return ''; + + const quoted = next.match(QUOTE_WRAPPER_PATTERN); + if (quoted?.[1]) { + next = quoted[1]; + } + + next = stripLeadIns(next); + next = collapseToSingleSentence(next); + next = stripUncertainEnvironmentTail(next); + next = stripLeadIns(next); + next = normalizeShotVocabulary(next); + next = stripLeadingShotArticle(next); + next = stripTerminalPunctuation(normalizeWhitespace(next)); + + if (next.length === 0) return ''; + + next = upperCaseFirst(next); + return /[.!?]$/u.test(next) ? next : `${next}.`; +} + +export function formatSceneCaptionFromData(data: SceneCaptionData): string { + const subjectText = data.subjects?.join(', '); + let body = ''; + + if (subjectText && data.action) { + body = `${subjectText} ${lowerCaseFirst(data.action)}`; + } else if (subjectText) { + body = subjectText; + } else if (data.action) { + body = data.action; + } else if (data.setting) { + body = `scene in ${data.setting}`; + } else if (data.lighting) { + body = `scene in ${data.lighting}`; + } + + if (data.setting && body && !body.toLowerCase().includes(data.setting.toLowerCase())) { + body = `${body} in ${data.setting}`; + } + + if (data.weather) { + body = body ? `${body} in ${data.weather} weather` : `${data.weather} weather`; + } + + if (data.timeOfDay) { + body = body ? `${body} at ${data.timeOfDay}` : data.timeOfDay; + } + + if (!body && data.caption) { + body = data.caption; + } + + if (!body) return ''; + + if (data.shotType) { + return formatSceneCaption(`${data.shotType} ${maybeWrapWithOf(body)}`); + } + + return formatSceneCaption(body); +} + +export function parseSceneCaptionResponse(raw: string): { + text: string; + sceneData?: SceneCaptionData; +} { + const parsed = parseJsonObject(raw) ?? parseLooseJsonObject(raw); + if (!parsed) { + const text = formatSceneCaption(raw); + return text ? { text } : { text: '' }; + } + + const sceneData = normalizeSceneCaptionData(parsed); + const text = sceneData.caption + ? formatSceneCaption(sceneData.caption) + : formatSceneCaptionFromData(sceneData) || formatSceneCaption(raw); + + if (!text) { + return { text: '' }; + } + + if (!hasStructuredFields(sceneData)) { + return { text }; + } + + return { + text, + sceneData: { + ...sceneData, + caption: text, + }, + }; +} diff --git a/src/lib/analysis/captioning/types.ts b/src/lib/analysis/captioning/types.ts index d6f53e12c..1fa39cf19 100644 --- a/src/lib/analysis/captioning/types.ts +++ b/src/lib/analysis/captioning/types.ts @@ -1,6 +1,43 @@ +export interface SceneCaptionData { + caption?: string; + shotType?: string; + subjects?: string[]; + action?: string; + setting?: string; + lighting?: string; + timeOfDay?: string; + weather?: string; +} + export interface MediaCaption { timeSec: number; text: string; + /** + * Structured scene metadata emitted by the caption model. Preserved for + * future semantic/indexing work while `text` remains the user-facing and + * search-facing sentence. + */ + sceneData?: SceneCaptionData; + /** + * Workspace-relative path to a captured JPEG thumbnail for this scene, + * e.g. `media/{mediaId}/cache/ai/captions-thumbs/{index}.jpg`. Absent on + * captions generated before the Scene Browser feature landed. + */ + thumbRelPath?: string; + /** + * Dense sentence embedding of the caption's embed-text (caption + + * transcript + colors). 384-dim for all-MiniLM-L6-v2. 
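Stored mean-pooled and L2-normalized by the embeddings worker, so cosine similarity against a query vector reduces to a dot product.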
When present, + * enables semantic text search. + */ + embedding?: number[]; + /** + * Structural dominant-color palette for the thumbnail, in CIELAB + * with pixel-coverage weights. Powers ∆E-based color-query ranking + * independent of CLIP — Lab distances are perceptually uniform so + * "red" queries actually hit red scenes rather than whatever CLIP + * happens to associate with the token. + */ + palette?: Array<{ l: number; a: number; b: number; weight: number }>; } export interface CaptioningProgress { @@ -14,6 +51,13 @@ export interface CaptioningOptions { onProgress?: (progress: CaptioningProgress) => void; signal?: AbortSignal; sampleIntervalSec?: number; + /** + * Optional persistence hook invoked once per captioned frame with the + * JPEG the provider already captured for VLM inference. Return a + * workspace-relative path to stash on `MediaCaption.thumbRelPath`; + * return `undefined` to skip the thumbnail for that frame. + */ + saveThumbnail?: (index: number, blob: Blob) => Promise<string | undefined>; } export interface MediaCaptioningProvider { diff --git a/src/lib/analysis/embeddings/clip-provider.ts b/src/lib/analysis/embeddings/clip-provider.ts new file mode 100644 index 000000000..1444ce497 --- /dev/null +++ b/src/lib/analysis/embeddings/clip-provider.ts @@ -0,0 +1,221 @@ +/** + * Singleton provider over the CLIP image/text worker. + * + * Exposes four operations — `ensureReady`, `embedImages`, `embedTextForImages`, + * and `embedQueryForImages` — that together let the Scene Browser index + * thumbnails and search them by free-form text queries, both running + * off-thread so the UI stays responsive while the model downloads. + */ + +import { createLogger } from '@/shared/logging/logger'; +import { createClipWorker } from './create-clip-worker'; +import type { EmbeddingsOptions } from './types'; + +const log = createLogger('ClipProvider'); + +export const CLIP_MODEL_ID = 'Xenova/clip-vit-base-patch32'; +export const CLIP_EMBEDDING_DIM = 512; + +const INIT_TIMEOUT_MS = 120_000; + +let worker: Worker | null = null; +let readyPromise: Promise<void> | null = null; +let nextId = 0; + +function getWorker(): Worker { + if (!worker) { + worker = createClipWorker(); + worker.addEventListener('error', (event) => { + log.error('CLIP worker errored', event.message); + }); + } + return worker; +} + +function ensureReady(options: EmbeddingsOptions = {}): Promise<void> { + if (readyPromise) return readyPromise; + const w = getWorker(); + + readyPromise = new Promise<void>((resolve, reject) => { + const timeout = setTimeout(() => { + cleanup(); + reject(new Error('CLIP worker init timed out')); + }, INIT_TIMEOUT_MS); + + const cleanup = () => { + clearTimeout(timeout); + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('CLIP init aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.type === 'ready') { + cleanup(); + resolve(); + return; + } + if (message.type === 'progress') { + options.onProgress?.({ stage: 'loading-model', percent: message.percent ?? 0 }); + return; + } + if (message.type === 'error' && message.id === undefined) { + cleanup(); + reject(new Error(message.message ??
'CLIP worker init failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + w.postMessage({ type: 'init' }); + }); + + readyPromise.catch(() => { + readyPromise = null; + }); + + return readyPromise; +} + +type EmbedRequest = + | { kind: 'images'; payload: Blob[] } + | { kind: 'text'; payload: string[] }; + +function runEmbed(request: EmbedRequest, options: EmbeddingsOptions = {}): Promise<Float32Array[]> { + if (request.payload.length === 0) return Promise.resolve([]); + + return ensureReady(options).then(() => new Promise<Float32Array[]>((resolve, reject) => { + const id = ++nextId; + const w = getWorker(); + + const cleanup = () => { + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('CLIP embed aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.id !== id) return; + if (message.type === 'vectors') { + cleanup(); + resolve(message.vectors as Float32Array[]); + return; + } + if (message.type === 'error') { + cleanup(); + reject(new Error(message.message ?? 'CLIP embed failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + if (request.kind === 'images') { + w.postMessage({ type: 'embed-images', id, blobs: request.payload }); + } else { + w.postMessage({ type: 'embed-text', id, texts: request.payload }); + } + })); +} + +/** + * Natural-sentence templates for CLIP query expansion. CLIP was trained + * on descriptive captions (`"a photo of a cat sitting on a windowsill"`), + * not bare keywords, so a one-word query like `"fighting"` embeds into a + * lonely corner of the joint space where random unrelated images can + * score higher than they should (a vertical tower against "fighting" + * scored 0.21 in one test run — classic short-query noise). + * + * Embedding the query through each template and averaging the resulting + * vectors re-anchors it inside the distribution of sentences CLIP was + * trained on, boosting real matches and suppressing noise. Standard + * retrieval-quality trick; ~5–15 points of nDCG in the literature. + */ +const CLIP_QUERY_TEMPLATES = [ + (q: string) => `a photo of ${q}`, + (q: string) => `a picture of ${q}`, + (q: string) => `a scene showing ${q}`, + (q: string) => q, +]; + +function averageAndNormalize(vectors: Float32Array[]): Float32Array { + const dim = vectors[0]!.length; + const out = new Float32Array(dim); + for (const v of vectors) { + for (let i = 0; i < dim; i += 1) out[i] += v[i]!; + } + let sum = 0; + for (let i = 0; i < dim; i += 1) sum += out[i]! * out[i]!; + const norm = Math.sqrt(sum) || 1; + for (let i = 0; i < dim; i += 1) out[i] /= norm; + return out; +} + +export const clipProvider = { + ensureReady, + + /** + * Embed a batch of image blobs with the CLIP vision encoder. Returned + * vectors live in the 512-dim joint space, so cosine similarity against + * text-encoder outputs is meaningful.
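+ * + * @example + * // Sketch — assumes `thumbBlob` is one of the JPEGs captured at analyze time: + * const [imageVec] = await clipProvider.embedImages([thumbBlob]); + * const queryVec = await clipProvider.embedQueryForImages('sunset over the ocean'); + * // Both sides come back L2-normalized, so cosine similarity is a plain dot product: + * let score = 0; + * if (imageVec && queryVec) { + * for (let i = 0; i < imageVec.length; i += 1) score += imageVec[i]! * queryVec[i]!; + * }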
+ */ + embedImages(blobs: Blob[], options?: EmbeddingsOptions): Promise<Float32Array[]> { + return runEmbed({ kind: 'images', payload: blobs }, options); + }, + + /** + * Embed text with the CLIP text encoder so results can be compared to + * stored image embeddings. This is the low-level path — call it when + * you're indexing canonical text (e.g. captions). For user search + * queries use {@link embedQueryForImages} instead; it averages a few + * natural-language templates to counter CLIP's well-known short-query + * noise. + */ + embedTextForImages(texts: string[], options?: EmbeddingsOptions): Promise<Float32Array[]> { + return runEmbed({ kind: 'text', payload: texts }, options); + }, + + /** + * Embed a single user query by ensembling across {@link CLIP_QUERY_TEMPLATES}. + * Returns one 512-dim vector — the L2-normalized mean of the + * per-template embeddings — suitable for cosine-similarity ranking + * against stored image embeddings. + */ + async embedQueryForImages(query: string, options?: EmbeddingsOptions): Promise<Float32Array | null> { + const trimmed = query.trim(); + if (!trimmed) return null; + const templates = CLIP_QUERY_TEMPLATES.map((t) => t(trimmed)); + const vectors = await runEmbed({ kind: 'text', payload: templates }, options); + if (vectors.length === 0) return null; + return averageAndNormalize(vectors); + }, + + dispose(): void { + if (!worker) return; + worker.postMessage({ type: 'dispose' }); + worker.terminate(); + worker = null; + readyPromise = null; + }, +}; diff --git a/src/lib/analysis/embeddings/clip-worker.ts b/src/lib/analysis/embeddings/clip-worker.ts new file mode 100644 index 000000000..603b3d4f5 --- /dev/null +++ b/src/lib/analysis/embeddings/clip-worker.ts @@ -0,0 +1,217 @@ +/** + * Web Worker for CLIP image + text embeddings. + * + * Loads both halves of `Xenova/clip-vit-base-patch32` (q8 quantized, + * ~90 MB total) so the same worker can embed: + * - scene thumbnails at caption time (image encoder), producing + * vectors that get stored in `captions-image-embeddings.bin`, and + * - search queries at query time (text encoder), producing a vector + * in the *same* 512-dim space so cosine similarity against image + * embeddings is meaningful. + * + * Kept separate from the all-MiniLM text worker because the models are + * large and users who never switch to semantic search shouldn't pay the + * CLIP download cost. + * + * Messages: + * → { type: 'init' } + * → { type: 'embed-images', id, blobs: Blob[] } + * → { type: 'embed-text', id, texts: string[] } + * → { type: 'dispose' } + * ← { type: 'ready', dim: number } + * ← { type: 'progress', percent: number } + * ← { type: 'vectors', id, vectors: Float32Array[] } + * ← { type: 'error', id?, message } + */ + +import { + AutoProcessor, + AutoTokenizer, + CLIPTextModelWithProjection, + CLIPVisionModelWithProjection, + RawImage, + env, + type PreTrainedTokenizer, + type Processor, + type PreTrainedModel, +} from '@huggingface/transformers'; + +const MODEL_ID = 'Xenova/clip-vit-base-patch32'; + +env.useBrowserCache = true; +env.allowLocalModels = false; + +/* eslint-disable @typescript-eslint/no-explicit-any -- transformers.js + tensor types vary by version; the worker stays schema-stable.
*/ +let tokenizer: PreTrainedTokenizer | null = null; +let processor: Processor | null = null; +let textModel: PreTrainedModel | null = null; +let visionModel: PreTrainedModel | null = null; +let loading = false; +let disposed = false; +let loadGeneration = 0; +let embeddingDim = 512; + +function post(msg: Record<string, unknown>): void { + self.postMessage(msg); +} + +async function loadModel(): Promise<void> { + if (tokenizer && processor && textModel && visionModel) { + post({ type: 'ready', dim: embeddingDim }); + return; + } + if (loading) return; + loading = true; + disposed = false; + const thisGen = ++loadGeneration; + + try { + let lastPct = 0; + const onProgress = (info: { status?: string; total?: number; loaded?: number }) => { + if (info.status === 'progress' && info.total && info.loaded) { + const pct = (info.loaded / info.total) * 100; + if (pct - lastPct > 2) { + lastPct = pct; + post({ type: 'progress', percent: Math.round(pct) }); + } + } + }; + + const [loadedTokenizer, loadedProcessor, loadedTextModel, loadedVisionModel] = await Promise.all([ + AutoTokenizer.from_pretrained(MODEL_ID), + AutoProcessor.from_pretrained(MODEL_ID), + CLIPTextModelWithProjection.from_pretrained(MODEL_ID, { + dtype: 'q8', + progress_callback: onProgress, + } as any), + CLIPVisionModelWithProjection.from_pretrained(MODEL_ID, { + dtype: 'q8', + progress_callback: onProgress, + } as any), + ]); + + if (disposed || thisGen !== loadGeneration) return; + + tokenizer = loadedTokenizer; + processor = loadedProcessor; + textModel = loadedTextModel; + visionModel = loadedVisionModel; + + // Probe the projection dim with a tiny warmup; different CLIP + // variants project to 512, 768, or 1024 dims and we want to be sure + // before callers start packing bins. + try { + const tokens = tokenizer(['probe'], { padding: true, truncation: true }) as any; + const output = (await (textModel as any)(tokens)) as any; + const dims: number[] | undefined = output?.text_embeds?.dims; + if (Array.isArray(dims) && dims.length > 0) { + embeddingDim = Number(dims[dims.length - 1]); + } + } catch { + // Stick with the default dim if the probe fails — the real embed + // calls will surface a more specific error if the model is bad. + } + + post({ type: 'ready', dim: embeddingDim }); + } catch (error) { + post({ type: 'error', message: error instanceof Error ? error.message : String(error) }); + } finally { + loading = false; + } +} + +function normalize(vector: Float32Array): Float32Array { + let sum = 0; + for (let i = 0; i < vector.length; i += 1) sum += vector[i]! * vector[i]!; + const norm = Math.sqrt(sum) || 1; + const out = new Float32Array(vector.length); + for (let i = 0; i < vector.length; i += 1) out[i] = vector[i]!
/ norm; + return out; +} + +function splitPacked(packed: Float32Array, count: number, dim: number): Float32Array[] { + const vectors: Float32Array[] = []; + for (let i = 0; i < count; i += 1) { + vectors.push(normalize(packed.slice(i * dim, (i + 1) * dim))); + } + return vectors; +} + +async function embedImages(id: number, blobs: Blob[]): Promise<void> { + if (!processor || !visionModel) { + post({ type: 'error', id, message: 'CLIP worker not ready (vision)' }); + return; + } + if (blobs.length === 0) { + post({ type: 'vectors', id, vectors: [] }); + return; + } + try { + const images = await Promise.all(blobs.map((blob) => RawImage.fromBlob(blob))); + const inputs = await (processor as any)(images); + const output = (await (visionModel as any)(inputs)) as any; + const data = output?.image_embeds?.data as Float32Array | undefined; + if (!data) throw new Error('CLIP vision model returned no image_embeds'); + post({ type: 'vectors', id, vectors: splitPacked(data, blobs.length, embeddingDim) }); + } catch (error) { + post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) }); + } +} + +async function embedTexts(id: number, texts: string[]): Promise<void> { + if (!tokenizer || !textModel) { + post({ type: 'error', id, message: 'CLIP worker not ready (text)' }); + return; + } + if (texts.length === 0) { + post({ type: 'vectors', id, vectors: [] }); + return; + } + try { + const tokens = (tokenizer as any)(texts, { padding: true, truncation: true }); + const output = (await (textModel as any)(tokens)) as any; + const data = output?.text_embeds?.data as Float32Array | undefined; + if (!data) throw new Error('CLIP text model returned no text_embeds'); + post({ type: 'vectors', id, vectors: splitPacked(data, texts.length, embeddingDim) }); + } catch (error) { + post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) }); + } +} + +self.addEventListener('message', (event: MessageEvent) => { + const message = event.data; + if (!message || typeof message.type !== 'string') return; + + if (message.type === 'init') { + void loadModel(); + return; + } + + if (message.type === 'embed-images') { + const id = typeof message.id === 'number' ? message.id : 0; + const blobs = Array.isArray(message.blobs) ? (message.blobs as Blob[]) : []; + void embedImages(id, blobs); + return; + } + + if (message.type === 'embed-text') { + const id = typeof message.id === 'number' ? message.id : 0; + const texts = Array.isArray(message.texts) + ?
(message.texts as unknown[]).filter((t): t is string => typeof t === 'string') + : []; + void embedTexts(id, texts); + return; + } + + if (message.type === 'dispose') { + disposed = true; + tokenizer = null; + processor = null; + textModel = null; + visionModel = null; + loading = false; + return; + } +}); +/* eslint-enable @typescript-eslint/no-explicit-any */ diff --git a/src/lib/analysis/embeddings/context.test.ts b/src/lib/analysis/embeddings/context.test.ts new file mode 100644 index 000000000..fccbb957c --- /dev/null +++ b/src/lib/analysis/embeddings/context.test.ts @@ -0,0 +1,160 @@ +import { describe, expect, it } from 'vitest'; +import { + buildEmbeddingText, + sliceTranscript, +} from './context'; +import { parseSceneCaptionResponse } from '../captioning/scene-caption-format'; + +describe('sliceTranscript', () => { + const segments = [ + { text: 'In the beginning.', start: 0, end: 2 }, + { text: 'We see a mountain.', start: 2, end: 5 }, + { text: 'The chef prepares dinner.', start: 10, end: 13 }, + { text: 'Later that night.', start: 30, end: 32 }, + ]; + + it('pulls segments overlapping the caption window', () => { + expect(sliceTranscript(segments, 11, 2)).toBe('The chef prepares dinner.'); + }); + + it('joins adjacent overlapping segments with a space', () => { + expect(sliceTranscript(segments, 3, 2)).toBe('In the beginning. We see a mountain.'); + }); + + it('returns empty string when transcript is missing', () => { + expect(sliceTranscript(null, 10)).toBe(''); + expect(sliceTranscript(undefined, 10)).toBe(''); + expect(sliceTranscript([], 10)).toBe(''); + }); + + it('returns empty string when nothing overlaps', () => { + expect(sliceTranscript(segments, 20, 1)).toBe(''); + }); + + it('clips long transcripts to a word boundary', () => { + const long = Array.from({ length: 50 }, (_, i) => ({ + text: `this is sentence number ${i}`, + start: i, + end: i + 1, + })); + const result = sliceTranscript(long, 25, 20); + expect(result.length).toBeLessThanOrEqual(220); + expect(result.endsWith(' ')).toBe(false); + expect(result.split(' ').pop()).not.toMatch(/^[a-z]*\d{1,2}$/); + }); +}); + +describe('buildEmbeddingText', () => { + const base = { + caption: { text: 'A tree with orange leaves.', timeSec: 10 }, + }; + + it('always starts with SCENE: ', () => { + const result = buildEmbeddingText(base); + expect(result.startsWith('SCENE: A tree with orange leaves.')).toBe(true); + }); + + it('includes structured scene metadata when supplied', () => { + const result = buildEmbeddingText({ + ...base, + sceneData: { + shotType: 'medium close-up', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + expect(result).toMatch(/SHOT: medium close-up/); + expect(result).toMatch(/TIME: dusk/); + expect(result).toMatch(/WEATHER: rainy/); + }); + + it('omits SPEECH: when transcript is missing or unmatched', () => { + const result = buildEmbeddingText(base); + expect(result).not.toMatch(/SPEECH:/); + }); + + it('includes SPEECH: when transcript overlaps caption timestamp', () => { + const result = buildEmbeddingText({ + ...base, + transcriptSegments: [{ text: 'and here is hokkaido', start: 9, end: 11 }], + }); + expect(result).toMatch(/SPEECH: and here is hokkaido/); + }); + + it('does not emit SOURCE: because filename was dropped from context', () => { + const result = buildEmbeddingText(base); + expect(result).not.toMatch(/SOURCE:/); + }); + + it('includes COLORS: when a phrase is provided', () => { + const result = buildEmbeddingText({ ...base, colorPhrase: 'warm orange, teal' }); + 
expect(result).toMatch(/COLORS: warm orange, teal/); + }); + + it('omits COLORS: for empty string', () => { + const result = buildEmbeddingText({ ...base, colorPhrase: ' ' }); + expect(result).not.toMatch(/COLORS:/); + }); + + it('preserves scene metadata before transcript and colors', () => { + const result = buildEmbeddingText({ + ...base, + sceneData: { + shotType: 'wide shot', + timeOfDay: 'dusk', + weather: 'foggy', + }, + transcriptSegments: [{ text: 'speech here', start: 9, end: 11 }], + colorPhrase: 'deep blue', + }); + const sceneIdx = result.indexOf('SCENE:'); + const shotIdx = result.indexOf('SHOT:'); + const timeIdx = result.indexOf('TIME:'); + const weatherIdx = result.indexOf('WEATHER:'); + const speechIdx = result.indexOf('SPEECH:'); + const colorsIdx = result.indexOf('COLORS:'); + expect(sceneIdx).toBeLessThan(shotIdx); + expect(shotIdx).toBeLessThan(timeIdx); + expect(timeIdx).toBeLessThan(weatherIdx); + expect(weatherIdx).toBeLessThan(speechIdx); + expect(speechIdx).toBeLessThan(colorsIdx); + }); + + it('produces a valid string even with only a caption', () => { + const result = buildEmbeddingText({ + caption: { text: 'Minimal scene.', timeSec: 0 }, + }); + expect(result).toBe('SCENE: Minimal scene.'); + }); + + it('preserves richer scene captions verbatim for downstream semantic indexing', () => { + const result = buildEmbeddingText({ + caption: { text: 'Medium close-up of a singer on a rainy street at dusk.', timeSec: 12 }, + sceneData: { + shotType: 'medium close-up', + timeOfDay: 'dusk', + weather: 'rainy', + }, + }); + expect(result).toBe( + 'SCENE: Medium close-up of a singer on a rainy street at dusk.\n' + + 'SHOT: medium close-up\n' + + 'TIME: dusk\n' + + 'WEATHER: rainy', + ); + }); + + it('turns json-ish caption model output into clean embedding text', () => { + const parsed = parseSceneCaptionResponse( + 'Json ["caption":"A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.","shotType":"medium wide shot","subjects":["lanterns","corridor","text"],"action":"glowing softly","setting":"interior corridor","lighting":"golden lantern light","timeOfDay":null,"weather":null}.', + ); + + expect(buildEmbeddingText({ + caption: { text: parsed.text, timeSec: 9 }, + sceneData: parsed.sceneData, + })).toBe( + 'SCENE: A dimly lit corridor illuminated by hanging lanterns, with a text overlay in Chinese at the bottom.\n' + + 'SHOT: medium-wide shot', + ); + }); +}); diff --git a/src/lib/analysis/embeddings/context.ts b/src/lib/analysis/embeddings/context.ts new file mode 100644 index 000000000..e8daebd6d --- /dev/null +++ b/src/lib/analysis/embeddings/context.ts @@ -0,0 +1,117 @@ +import type { SceneCaptionData } from '../captioning/types'; + +/** + * Embedding context builder. + * + * The caption text alone carries a lot of semantic signal, but the Scene + * Browser gets dramatically better results when adjacent context is + * folded into the string before embedding. We concatenate same-space + * signals into one structured input so that: + * + * - a query like "sunset in hokkaido" matches on caption + nearby + * speech even when neither alone is sufficient, + * - "orange sky" matches scenes whose caption doesn't name colors but + * whose thumbnail is dominated by warm tones, + * - "she explains the recipe" matches scenes where the caption is + * terse ("woman in kitchen") but the nearby transcript is rich.
+ * + * Missing signals are simply omitted — a no-transcript b-roll scene + * produces a shorter string, not a weaker vector. This is the whole + * reason we chose concat-and-embed-once over parallel vectors for + * same-modality signals. + */ + +export interface TranscriptSegment { + text: string; + start: number; + end: number; +} + +export interface BuildEmbeddingTextInput { + caption: { text: string; timeSec: number }; + sceneData?: SceneCaptionData; + /** + * Retained for call-site compatibility but unused — filename tokens + * turned out to be noise for editor workflows (proxied filenames, + * generic "final_export" stems drifted meaning more than they helped). + */ + fileName?: string; + /** Full transcript for the source media, used to slice per-caption. */ + transcriptSegments?: TranscriptSegment[] | null; + /** + * Human-readable dominant-color phrase for the caption's thumbnail, + * e.g. `"warm orange, deep teal, near black"`. Computed off the JPEG + * the captioning provider already captured at analyze time. This is + * a fuzzy hint for the transformer; the structural Lab palette in + * `paletteForLab` is what powers exact color-query ranking. + */ + colorPhrase?: string; +} + +/** ± radius in seconds around the caption timestamp to pull transcript from. */ +const DEFAULT_TRANSCRIPT_RADIUS_SEC = 2; + +/** Longer values drown the caption signal in transcript chatter. */ +const TRANSCRIPT_MAX_CHARS = 220; + +/** + * Pull transcript text that overlaps with a caption's time window. Joins + * the chosen segments and caps length so long speeches don't dominate + * the embedding input (all-MiniLM truncates around 256 tokens anyway). + */ +export function sliceTranscript( + segments: TranscriptSegment[] | null | undefined, + timeSec: number, + radiusSec: number = DEFAULT_TRANSCRIPT_RADIUS_SEC, +): string { + if (!segments || segments.length === 0) return ''; + const from = timeSec - radiusSec; + const to = timeSec + radiusSec; + const chunks: string[] = []; + for (const segment of segments) { + if (segment.end < from || segment.start > to) continue; + const text = segment.text.trim(); + if (text) chunks.push(text); + } + const joined = chunks.join(' ').replace(/\s+/g, ' ').trim(); + if (joined.length <= TRANSCRIPT_MAX_CHARS) return joined; + // Clip to a word boundary so the truncation doesn't leave half-words + // in the embedding input. + const clipped = joined.slice(0, TRANSCRIPT_MAX_CHARS); + const lastSpace = clipped.lastIndexOf(' '); + return lastSpace > TRANSCRIPT_MAX_CHARS * 0.6 ? clipped.slice(0, lastSpace) : clipped; +} + +/** + * Compose the string that actually gets embedded. Ordering matters a + * little — caption first because it's the primary signal, optional + * context lines after. Line prefixes like `SCENE:` aren't magic; they + * just give the transformer a small semantic anchor. + * + * Note: we deliberately don't include filename/filepath tokens here. + * They tested poorly in practice (proxied renders, generic "export" + * stems, project-template names) and shifted embeddings toward the + * *filename* rather than the scene content. 
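+ * + * @example + * // Mirrors the unit test — structured fields become prefixed lines: + * buildEmbeddingText({ + *   caption: { text: 'Medium close-up of a singer on a rainy street at dusk.', timeSec: 12 }, + *   sceneData: { shotType: 'medium close-up', timeOfDay: 'dusk', weather: 'rainy' }, + * }); + * // => 'SCENE: Medium close-up of a singer on a rainy street at dusk.\n' + + * //    'SHOT: medium close-up\nTIME: dusk\nWEATHER: rainy'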
+ */ +export function buildEmbeddingText(input: BuildEmbeddingTextInput): string { + const lines: string[] = []; + const caption = input.caption.text.trim(); + lines.push(`SCENE: ${caption}`); + + const shotType = input.sceneData?.shotType?.trim(); + if (shotType) lines.push(`SHOT: ${shotType}`); + + const timeOfDay = input.sceneData?.timeOfDay?.trim(); + if (timeOfDay) lines.push(`TIME: ${timeOfDay}`); + + const weather = input.sceneData?.weather?.trim(); + if (weather) lines.push(`WEATHER: ${weather}`); + + const speech = sliceTranscript(input.transcriptSegments, input.caption.timeSec); + if (speech) lines.push(`SPEECH: ${speech}`); + + const colors = input.colorPhrase?.trim(); + if (colors) lines.push(`COLORS: ${colors}`); + + return lines.join('\n'); +} diff --git a/src/lib/analysis/embeddings/create-clip-worker.ts b/src/lib/analysis/embeddings/create-clip-worker.ts new file mode 100644 index 000000000..7023d4409 --- /dev/null +++ b/src/lib/analysis/embeddings/create-clip-worker.ts @@ -0,0 +1,5 @@ +import ClipWorker from './clip-worker.ts?worker'; + +export function createClipWorker(): Worker { + return new ClipWorker(); +} diff --git a/src/lib/analysis/embeddings/create-embeddings-worker.ts b/src/lib/analysis/embeddings/create-embeddings-worker.ts new file mode 100644 index 000000000..cf74c897e --- /dev/null +++ b/src/lib/analysis/embeddings/create-embeddings-worker.ts @@ -0,0 +1,9 @@ +/** + * Vite-aware factory for the sentence-embeddings worker. + * Matches the pattern used by `create-lfm-worker.ts`. + */ +import EmbeddingsWorker from './embeddings-worker.ts?worker'; + +export function createEmbeddingsWorker(): Worker { + return new EmbeddingsWorker(); +} diff --git a/src/lib/analysis/embeddings/dominant-colors.ts b/src/lib/analysis/embeddings/dominant-colors.ts new file mode 100644 index 000000000..330965645 --- /dev/null +++ b/src/lib/analysis/embeddings/dominant-colors.ts @@ -0,0 +1,183 @@ +/** + * Cheap dominant-color extractor that turns a scene thumbnail into + * either a short English phrase ("warm orange, teal, near black") + * for embedding context, or a structural Lab palette for exact + * color-query ranking — we run one pass and emit both. + * + * Runs in ~5-15 ms per thumbnail on a downsampled 64×64 grid — much + * cheaper than k-means. Quantizes each pixel into a 4×4×4 RGB bucket + * (64 bins total), takes the most populated ones, and reports either + * a label string or the Lab + weight tuple. Lab coordinates use + * D65 sRGB as the source and are the canonical input for ∆E queries. + */ + +import { rgbToLab } from './lab-color'; + +const SAMPLE_SIZE = 64; +const TOP_COLOR_COUNT = 4; // one more than the phrase variant — palette ranking benefits from extra context +const MIN_BIN_FRACTION = 0.04; // ignore colors that cover <4% of the frame + +/** + * Structural entry in a scene's dominant color palette. Stored + * per-caption and queried at rank time via ∆E 2000 against user + * color terms. + */ +export interface PaletteEntry { + /** CIELAB components; `l` ∈ [0, 100], `a`/`b` ≈ [-128, 128]. */ + l: number; + a: number; + b: number; + /** 0–1 fraction of thumbnail pixels assigned to this bin. 
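Weights need not sum to 1 — bins covering less than 4% of the frame are dropped and only the top four are kept.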
*/ + weight: number; +} + +interface BinEntry { + count: number; + rSum: number; + gSum: number; + bSum: number; +} + +function rgbToHsl(r: number, g: number, b: number): { h: number; s: number; l: number } { + const rf = r / 255; + const gf = g / 255; + const bf = b / 255; + const max = Math.max(rf, gf, bf); + const min = Math.min(rf, gf, bf); + const l = (max + min) / 2; + let h = 0; + let s = 0; + if (max !== min) { + const delta = max - min; + s = l > 0.5 ? delta / (2 - max - min) : delta / (max + min); + switch (max) { + case rf: h = ((gf - bf) / delta + (gf < bf ? 6 : 0)); break; + case gf: h = ((bf - rf) / delta + 2); break; + default: h = ((rf - gf) / delta + 4); break; + } + h *= 60; + } + return { h, s: s * 100, l: l * 100 }; +} + +function hueLabel(hue: number): string { + // 8-way hue wheel — generic enough that a query like "orange sky" + // reliably hits thumbs with warm sunset tones, specific enough that + // "green" doesn't collapse into "yellow-green". + if (hue < 15 || hue >= 345) return 'red'; + if (hue < 40) return 'orange'; + if (hue < 65) return 'yellow'; + if (hue < 95) return 'yellow green'; + if (hue < 165) return 'green'; + if (hue < 200) return 'teal'; + if (hue < 255) return 'blue'; + if (hue < 285) return 'purple'; + if (hue < 345) return 'pink'; + return 'red'; +} + +function colorLabel(r: number, g: number, b: number): string { + const { h, s, l } = rgbToHsl(r, g, b); + if (l < 12) return 'near black'; + if (l > 92) return 'near white'; + if (s < 12) { + if (l < 35) return 'dark gray'; + if (l < 65) return 'gray'; + return 'light gray'; + } + const hue = hueLabel(h); + if (l < 25) return `dark ${hue}`; + if (l > 75) return `light ${hue}`; + if (l < 45 && s > 40) return `deep ${hue}`; + if (s > 70 && l > 50 && (hue === 'orange' || hue === 'red' || hue === 'yellow')) { + return `warm ${hue}`; + } + return hue; +} + +interface ExtractedColors { + /** Human-readable phrase for the embedding input. */ + phrase: string; + /** Lab+weight entries ranked by coverage, ready to ∆E against. */ + palette: PaletteEntry[]; +} + +/** + * One-pass dominant-color extraction. Returns both the labeled + * phrase (for the transformer-visible COLORS: line) and the + * structural Lab palette (for ∆E color-query ranking). 
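+ * + * @example + * // Sketch — `thumbBlob` is a scene JPEG; palette numbers are illustrative: + * const { phrase, palette } = await extractDominantColors(thumbBlob); + * // phrase  -> 'warm orange, teal, near black' + * // palette -> [{ l: 54.3, a: 42.1, b: 58.9, weight: 0.41 }, ...]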
+ */ +export async function extractDominantColors(blob: Blob): Promise<ExtractedColors> { + let bitmap: ImageBitmap | null = null; + try { + bitmap = await createImageBitmap(blob); + } catch { + return { phrase: '', palette: [] }; + } + try { + const canvas = new OffscreenCanvas(SAMPLE_SIZE, SAMPLE_SIZE); + const context = canvas.getContext('2d'); + if (!context) return { phrase: '', palette: [] }; + context.drawImage(bitmap, 0, 0, SAMPLE_SIZE, SAMPLE_SIZE); + const { data } = context.getImageData(0, 0, SAMPLE_SIZE, SAMPLE_SIZE); + + const bins = new Map<number, BinEntry>(); + const totalPixels = data.length / 4; + for (let i = 0; i < data.length; i += 4) { + const r = data[i]!; + const g = data[i + 1]!; + const b = data[i + 2]!; + const key = ((r >> 6) << 4) | ((g >> 6) << 2) | (b >> 6); + const bin = bins.get(key); + if (bin) { + bin.count += 1; + bin.rSum += r; + bin.gSum += g; + bin.bSum += b; + } else { + bins.set(key, { count: 1, rSum: r, gSum: g, bSum: b }); + } + } + + const ranked = [...bins.values()] + .filter((bin) => bin.count / totalPixels >= MIN_BIN_FRACTION) + .sort((a, b) => b.count - a.count) + .slice(0, TOP_COLOR_COUNT); + + const labels: string[] = []; + const seenLabels = new Set<string>(); + const palette: PaletteEntry[] = []; + + for (const bin of ranked) { + const r = Math.round(bin.rSum / bin.count); + const g = Math.round(bin.gSum / bin.count); + const b = Math.round(bin.bSum / bin.count); + + const label = colorLabel(r, g, b); + if (!seenLabels.has(label)) { + seenLabels.add(label); + labels.push(label); + } + + const lab = rgbToLab(r, g, b); + palette.push({ + l: Number(lab.l.toFixed(2)), + a: Number(lab.a.toFixed(2)), + b: Number(lab.b.toFixed(2)), + weight: Number((bin.count / totalPixels).toFixed(3)), + }); + } + + return { phrase: labels.join(', '), palette }; + } finally { + bitmap.close(); + } +} + +/** + * Back-compat helper for code paths that only need the human phrase + * (embedding input). Equivalent to `extractDominantColors().phrase`. + */ +export async function extractDominantColorPhrase(blob: Blob): Promise<string> { + return (await extractDominantColors(blob)).phrase; +} diff --git a/src/lib/analysis/embeddings/embeddings-provider.ts b/src/lib/analysis/embeddings/embeddings-provider.ts new file mode 100644 index 000000000..39d13669e --- /dev/null +++ b/src/lib/analysis/embeddings/embeddings-provider.ts @@ -0,0 +1,155 @@ +/** + * Singleton sentence-embedding provider built on top of `embeddings-worker`. + * + * A single worker instance is reused for the lifetime of the tab — + * re-instantiating would force another model download. The module-scoped + * state is intentional; callers should go through the exported + * {@link embeddingsProvider} rather than constructing anything themselves.
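+ * + * @example + * // Sketch — the ~22 MB model downloads lazily on the first call: + * const vector = await embeddingsProvider.embed('SCENE: A tree with orange leaves.'); + * // vector: Float32Array of length EMBEDDING_MODEL_DIM (384), already L2-normalized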
+ */ + +import { createLogger } from '@/shared/logging/logger'; +import { createEmbeddingsWorker } from './create-embeddings-worker'; +import { + EMBEDDING_MODEL_DIM, + EMBEDDING_MODEL_ID, + type EmbeddingsOptions, + type EmbeddingsProvider, +} from './types'; + +const log = createLogger('EmbeddingsProvider'); + +const INIT_TIMEOUT_MS = 60_000; + +let worker: Worker | null = null; +let readyPromise: Promise<void> | null = null; +let nextId = 0; + +function getWorker(): Worker { + if (!worker) { + worker = createEmbeddingsWorker(); + worker.addEventListener('error', (event) => { + log.error('Embeddings worker errored', event.message); + }); + } + return worker; +} + +function ensureReady(options: EmbeddingsOptions = {}): Promise<void> { + if (readyPromise) return readyPromise; + const w = getWorker(); + + readyPromise = new Promise<void>((resolve, reject) => { + const timeout = setTimeout(() => { + cleanup(); + reject(new Error('Embeddings worker init timed out')); + }, INIT_TIMEOUT_MS); + + const cleanup = () => { + clearTimeout(timeout); + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('Embedding init aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.type === 'ready') { + cleanup(); + resolve(); + return; + } + if (message.type === 'progress') { + options.onProgress?.({ stage: 'loading-model', percent: message.percent ?? 0 }); + return; + } + if (message.type === 'error' && message.id === undefined) { + cleanup(); + reject(new Error(message.message ?? 'Embeddings worker init failed')); + } + }; + + if (options.signal?.aborted) { + cleanup(); + reject(options.signal.reason); + return; + } + options.signal?.addEventListener('abort', onAbort, { once: true }); + + w.addEventListener('message', onMessage); + w.postMessage({ type: 'init' }); + }); + + readyPromise.catch(() => { + // A failed init should not pin the promise forever — subsequent calls + // will retry (model might have been offline, transient error, etc.). + readyPromise = null; + }); + + return readyPromise; +} + +function embedBatch(texts: string[], options: EmbeddingsOptions = {}): Promise<Float32Array[]> { + if (texts.length === 0) return Promise.resolve([]); + + const id = ++nextId; + const w = getWorker(); + + return ensureReady(options).then(() => new Promise<Float32Array[]>((resolve, reject) => { + const cleanup = () => { + w.removeEventListener('message', onMessage); + options.signal?.removeEventListener('abort', onAbort); + }; + + const onAbort = () => { + cleanup(); + reject(options.signal?.reason ?? new Error('Embedding aborted')); + }; + + const onMessage = (event: MessageEvent) => { + const message = event.data; + if (message.id !== id) return; + if (message.type === 'embeddings') { + cleanup(); + resolve(message.vectors as Float32Array[]); + return; + } + if (message.type === 'error') { + cleanup(); + reject(new Error(message.message ??
+      }
+    };
+
+    if (options.signal?.aborted) {
+      cleanup();
+      reject(options.signal.reason);
+      return;
+    }
+    options.signal?.addEventListener('abort', onAbort, { once: true });
+
+    w.addEventListener('message', onMessage);
+    w.postMessage({ type: 'embed', id, texts });
+  }));
+}
+
+export const embeddingsProvider: EmbeddingsProvider = {
+  ensureReady,
+  async embed(text, options) {
+    const [vector] = await embedBatch([text], options);
+    if (!vector) throw new Error('Embedding returned no vector');
+    return vector;
+  },
+  embedBatch,
+  dispose() {
+    if (!worker) return;
+    worker.postMessage({ type: 'dispose' });
+    worker.terminate();
+    worker = null;
+    readyPromise = null;
+  },
+};
+
+export { EMBEDDING_MODEL_ID, EMBEDDING_MODEL_DIM };
diff --git a/src/lib/analysis/embeddings/embeddings-worker.ts b/src/lib/analysis/embeddings/embeddings-worker.ts
new file mode 100644
index 000000000..64b4ec511
--- /dev/null
+++ b/src/lib/analysis/embeddings/embeddings-worker.ts
@@ -0,0 +1,125 @@
+/**
+ * Web Worker for sentence-embedding generation using Xenova/all-MiniLM-L6-v2.
+ *
+ * The model is quantized (~22 MB) and runs via `pipeline('feature-extraction')`
+ * from @huggingface/transformers. Loaded lazily on first init, cached in the
+ * browser after download.
+ *
+ * Messages:
+ *   → { type: 'init' } — preload model
+ *   → { type: 'embed', id, texts: string[] } — batch embed
+ *   → { type: 'dispose' } — release model
+ *   ← { type: 'ready', dim: number } — model loaded; embedding dimension
+ *   ← { type: 'progress', percent: number } — model download progress
+ *   ← { type: 'embeddings', id, vectors: Float32Array[] } — batch result
+ *   ← { type: 'error', id?, message } — error
+ */
+
+import { pipeline, env, type FeatureExtractionPipeline } from '@huggingface/transformers';
+
+const MODEL_ID = 'Xenova/all-MiniLM-L6-v2';
+
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let extractor: FeatureExtractionPipeline | null = null;
+let loading = false;
+let disposed = false;
+let loadGeneration = 0;
+let embeddingDim = 384;
+
+function post(msg: Record<string, unknown>): void {
+  self.postMessage(msg);
+}
+
+async function loadModel(): Promise<void> {
+  if (extractor) {
+    post({ type: 'ready', dim: embeddingDim });
+    return;
+  }
+  if (loading) return;
+  loading = true;
+  disposed = false;
+  const thisGen = ++loadGeneration;
+
+  try {
+    let lastPct = 0;
+    const loaded = await pipeline('feature-extraction', MODEL_ID, {
+      dtype: 'q8',
+      progress_callback: (info: { status?: string; total?: number; loaded?: number }) => {
+        if (info.status === 'progress' && info.total && info.loaded) {
+          const pct = (info.loaded / info.total) * 100;
+          if (pct - lastPct > 2) {
+            lastPct = pct;
+            post({ type: 'progress', percent: Math.round(pct) });
+          }
+        }
+      },
+    });
+
+    if (disposed || thisGen !== loadGeneration) {
+      return;
+    }
+
+    extractor = loaded as FeatureExtractionPipeline;
+    // Probe dimension with a one-token warmup so the first real query isn't
+    // the one that pays the shape-inference cost.
+    const warmup = await extractor('probe', { pooling: 'mean', normalize: true });
+    embeddingDim = Array.isArray(warmup.dims) ? Number(warmup.dims[warmup.dims.length - 1]) : 384;
+
+    post({ type: 'ready', dim: embeddingDim });
+  } catch (error) {
+    post({ type: 'error', message: error instanceof Error ? error.message : String(error) });
+  } finally {
+    loading = false;
+  }
+}
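+
+// Example exchange (hypothetical values), as seen from the provider side:
+//   → { type: 'embed', id: 3, texts: ['a red car', 'ocean at dusk'] }
+//   ← { type: 'embeddings', id: 3, vectors: [Float32Array(384), Float32Array(384)] }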
+
+async function embedBatch(id: number, texts: string[]): Promise<void> {
+  if (!extractor) {
+    post({ type: 'error', id, message: 'Embeddings worker not ready' });
+    return;
+  }
+  try {
+    // Mean-pool + L2-normalize so cosine similarity becomes a dot product
+    // at the ranking site — no per-row normalization needed downstream.
+    const tensor = await extractor(texts, { pooling: 'mean', normalize: true });
+    const flat = tensor.data as Float32Array;
+    const dim = embeddingDim;
+    const vectors: Float32Array[] = [];
+    for (let i = 0; i < texts.length; i += 1) {
+      vectors.push(flat.slice(i * dim, (i + 1) * dim));
+    }
+    // Transfer the underlying buffers: each `slice` above owns its own
+    // ArrayBuffer, so transferring avoids a copy per 384-dim vector across
+    // the worker boundary.
+    self.postMessage(
+      { type: 'embeddings', id, vectors },
+      vectors.map((vector) => vector.buffer as ArrayBuffer),
+    );
+  } catch (error) {
+    post({ type: 'error', id, message: error instanceof Error ? error.message : String(error) });
+  }
+}
+
+self.addEventListener('message', (event: MessageEvent) => {
+  const message = event.data;
+  if (!message || typeof message.type !== 'string') return;
+
+  if (message.type === 'init') {
+    void loadModel();
+    return;
+  }
+
+  if (message.type === 'embed') {
+    const id = typeof message.id === 'number' ? message.id : 0;
+    const texts = Array.isArray(message.texts) ? message.texts.filter((t: unknown) => typeof t === 'string') : [];
+    void embedBatch(id, texts);
+    return;
+  }
+
+  if (message.type === 'dispose') {
+    disposed = true;
+    extractor = null;
+    loading = false;
+    return;
+  }
+});
diff --git a/src/lib/analysis/embeddings/index.ts b/src/lib/analysis/embeddings/index.ts
new file mode 100644
index 000000000..bcff3e303
--- /dev/null
+++ b/src/lib/analysis/embeddings/index.ts
@@ -0,0 +1,16 @@
+export { embeddingsProvider, EMBEDDING_MODEL_ID, EMBEDDING_MODEL_DIM } from './embeddings-provider';
+export { clipProvider, CLIP_MODEL_ID, CLIP_EMBEDDING_DIM } from './clip-provider';
+export type {
+  EmbeddingsOptions,
+  EmbeddingsProgress,
+  EmbeddingsProvider,
+} from './types';
+export {
+  buildEmbeddingText,
+  sliceTranscript,
+} from './context';
+export type { BuildEmbeddingTextInput, TranscriptSegment } from './context';
+export { extractDominantColors, extractDominantColorPhrase } from './dominant-colors';
+export type { PaletteEntry } from './dominant-colors';
+export { rgbToLab, deltaE76, deltaE2000 } from './lab-color';
+export type { LabColor } from './lab-color';
diff --git a/src/lib/analysis/embeddings/lab-color.test.ts b/src/lib/analysis/embeddings/lab-color.test.ts
new file mode 100644
index 000000000..67df38ab3
--- /dev/null
+++ b/src/lib/analysis/embeddings/lab-color.test.ts
@@ -0,0 +1,80 @@
+import { describe, expect, it } from 'vitest';
+import { deltaE2000, deltaE76, rgbToLab } from './lab-color';
+
+describe('rgbToLab', () => {
+  it('maps pure white to L=100, a=0, b=0', () => {
+    const { l, a, b } = rgbToLab(255, 255, 255);
+    expect(l).toBeCloseTo(100, 1);
+    expect(a).toBeCloseTo(0, 1);
+    expect(b).toBeCloseTo(0, 1);
+  });
+
+  it('maps pure black to L=0', () => {
+    const { l, a, b } = rgbToLab(0, 0, 0);
+    expect(l).toBeCloseTo(0, 1);
+    expect(a).toBeCloseTo(0, 1);
+    expect(b).toBeCloseTo(0, 1);
+  });
+
+  it('maps pure sRGB red to the canonical Lab red region', () => {
+    // Reference values from Bruce Lindbloom's sRGB calculator.
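+    // Note: toBeCloseTo(x, 0) asserts |actual - x| < 0.5, loose enough to
+    // absorb rounding in the sRGB matrix constants.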
+ const lab = rgbToLab(255, 0, 0); + expect(lab.l).toBeCloseTo(53.24, 0); + expect(lab.a).toBeCloseTo(80.09, 0); + expect(lab.b).toBeCloseTo(67.20, 0); + }); + + it('maps pure sRGB green to its known Lab coordinates', () => { + const lab = rgbToLab(0, 255, 0); + expect(lab.l).toBeCloseTo(87.73, 0); + expect(lab.a).toBeCloseTo(-86.18, 0); + expect(lab.b).toBeCloseTo(83.18, 0); + }); + + it('maps pure sRGB blue to its known Lab coordinates', () => { + const lab = rgbToLab(0, 0, 255); + expect(lab.l).toBeCloseTo(32.30, 0); + expect(lab.a).toBeCloseTo(79.19, 0); + expect(lab.b).toBeCloseTo(-107.86, 0); + }); +}); + +describe('deltaE76', () => { + it('returns 0 for identical colors', () => { + const red = rgbToLab(255, 0, 0); + expect(deltaE76(red, red)).toBeCloseTo(0, 5); + }); + + it('is larger between red and blue than between red and dark red', () => { + const red = rgbToLab(255, 0, 0); + const darkRed = rgbToLab(180, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE76(red, blue)).toBeGreaterThan(deltaE76(red, darkRed)); + }); +}); + +describe('deltaE2000', () => { + it('returns 0 for identical colors', () => { + const red = rgbToLab(255, 0, 0); + expect(deltaE2000(red, red)).toBeCloseTo(0, 5); + }); + + it('gives a small delta for near-duplicate reds', () => { + const red = rgbToLab(255, 0, 0); + const nearRed = rgbToLab(250, 5, 5); + expect(deltaE2000(red, nearRed)).toBeLessThan(3); + }); + + it('gives a large delta for red vs blue', () => { + const red = rgbToLab(255, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE2000(red, blue)).toBeGreaterThan(40); + }); + + it('ranks "orange vs red" closer than "orange vs blue"', () => { + const orange = rgbToLab(255, 128, 0); + const red = rgbToLab(255, 0, 0); + const blue = rgbToLab(0, 0, 255); + expect(deltaE2000(orange, red)).toBeLessThan(deltaE2000(orange, blue)); + }); +}); diff --git a/src/lib/analysis/embeddings/lab-color.ts b/src/lib/analysis/embeddings/lab-color.ts new file mode 100644 index 000000000..900439ad6 --- /dev/null +++ b/src/lib/analysis/embeddings/lab-color.ts @@ -0,0 +1,156 @@ +/** + * sRGB → CIELAB conversion and Delta E perceptual distance. + * + * Color-by-query is notoriously hard for CLIP — CLIP was trained on + * captions like `"a photo of a red firetruck"` where color is attached + * to an object, so bare color queries drift to weak matches. Industry + * CBIR systems (Imgix, TinEye, classic color histograms) use the + * CIELAB color space with ∆E distance because Lab is approximately + * perceptually uniform — equal ∆E steps correspond to equal visible + * differences. This module provides that pipeline. + * + * Conversion constants come from the D65 reference illuminant, which + * matches sRGB's standard viewing conditions. The ∆E 2000 formula is + * the industry standard for perceptual distance; ∆E 76 is the + * simpler Euclidean version used as a fast fallback. + */ + +export interface LabColor { + l: number; + a: number; + b: number; +} + +// D65 reference white in XYZ. +const REF_X = 0.95047; +const REF_Y = 1.0; +const REF_Z = 1.08883; + +function sRgbCompand(v: number): number { + // Inverse of the sRGB gamma companding — get back to linear light. + const normalized = v / 255; + return normalized <= 0.04045 + ? normalized / 12.92 + : Math.pow((normalized + 0.055) / 1.055, 2.4); +} + +function labFTransform(t: number): number { + const epsilon = 216 / 24389; // 0.008856... + const kappa = 24389 / 27; // 903.3... + return t > epsilon + ? 
Math.cbrt(t) + : (kappa * t + 16) / 116; +} + +/** + * Convert 0–255 sRGB values to CIELAB. Input assumed gamma-encoded + * (as JPEGs are). Output `l` is in 0–100, `a`/`b` roughly in -128..127. + */ +export function rgbToLab(r: number, g: number, b: number): LabColor { + const rLin = sRgbCompand(r); + const gLin = sRgbCompand(g); + const bLin = sRgbCompand(b); + + // sRGB → XYZ (D65) + const x = rLin * 0.4124564 + gLin * 0.3575761 + bLin * 0.1804375; + const y = rLin * 0.2126729 + gLin * 0.7151522 + bLin * 0.0721750; + const z = rLin * 0.0193339 + gLin * 0.1191920 + bLin * 0.9503041; + + // XYZ → Lab + const fx = labFTransform(x / REF_X); + const fy = labFTransform(y / REF_Y); + const fz = labFTransform(z / REF_Z); + + return { + l: 116 * fy - 16, + a: 500 * (fx - fy), + b: 200 * (fy - fz), + }; +} + +/** + * Simple Euclidean distance in Lab (∆E 76). Cheap, approximate — + * values below ~2 are visually indistinguishable, 2–10 is a subtle + * change, 10+ is obviously different. + */ +export function deltaE76(a: LabColor, b: LabColor): number { + const dL = a.l - b.l; + const dA = a.a - b.a; + const dB = a.b - b.b; + return Math.sqrt(dL * dL + dA * dA + dB * dB); +} + +/** + * CIEDE 2000 — industry-standard perceptual distance. Corrects for + * known issues with ∆E 76 (hue non-linearity, blue/purple cluster + * distortion). More expensive but still cheap enough to run per + * palette entry per query on the hot path. + * + * Formula source: Sharma et al. (2005), "The CIEDE2000 Color-Difference + * Formula: Implementation Notes, Supplementary Test Data, and + * Mathematical Observations." + */ +export function deltaE2000(c1: LabColor, c2: LabColor): number { + const { l: l1, a: a1, b: b1 } = c1; + const { l: l2, a: a2, b: b2 } = c2; + + const avgL = (l1 + l2) / 2; + const c1ab = Math.sqrt(a1 * a1 + b1 * b1); + const c2ab = Math.sqrt(a2 * a2 + b2 * b2); + const avgC = (c1ab + c2ab) / 2; + + const g = 0.5 * (1 - Math.sqrt(Math.pow(avgC, 7) / (Math.pow(avgC, 7) + Math.pow(25, 7)))); + const a1p = a1 * (1 + g); + const a2p = a2 * (1 + g); + + const c1p = Math.sqrt(a1p * a1p + b1 * b1); + const c2p = Math.sqrt(a2p * a2p + b2 * b2); + const avgCp = (c1p + c2p) / 2; + + const h1p = Math.atan2(b1, a1p) >= 0 + ? Math.atan2(b1, a1p) + : Math.atan2(b1, a1p) + 2 * Math.PI; + const h2p = Math.atan2(b2, a2p) >= 0 + ? Math.atan2(b2, a2p) + : Math.atan2(b2, a2p) + 2 * Math.PI; + + const dHp = (() => { + if (c1p * c2p === 0) return 0; + const diff = h2p - h1p; + if (Math.abs(diff) <= Math.PI) return diff; + return diff > Math.PI ? diff - 2 * Math.PI : diff + 2 * Math.PI; + })(); + + const dLp = l2 - l1; + const dCp = c2p - c1p; + const dHpFinal = 2 * Math.sqrt(c1p * c2p) * Math.sin(dHp / 2); + + const avgHp = (() => { + if (c1p * c2p === 0) return h1p + h2p; + if (Math.abs(h1p - h2p) <= Math.PI) return (h1p + h2p) / 2; + return h1p + h2p < 2 * Math.PI + ? 
(h1p + h2p + 2 * Math.PI) / 2
+      : (h1p + h2p - 2 * Math.PI) / 2;
+  })();
+
+  const t = 1
+    - 0.17 * Math.cos(avgHp - Math.PI / 6)
+    + 0.24 * Math.cos(2 * avgHp)
+    + 0.32 * Math.cos(3 * avgHp + Math.PI / 30)
+    - 0.20 * Math.cos(4 * avgHp - (63 * Math.PI) / 180);
+
+  const sl = 1 + (0.015 * Math.pow(avgL - 50, 2)) / Math.sqrt(20 + Math.pow(avgL - 50, 2));
+  const sc = 1 + 0.045 * avgCp;
+  const sh = 1 + 0.015 * avgCp * t;
+
+  const dTheta = (30 * Math.PI / 180) * Math.exp(-Math.pow((avgHp * 180 / Math.PI - 275) / 25, 2));
+  const rc = 2 * Math.sqrt(Math.pow(avgCp, 7) / (Math.pow(avgCp, 7) + Math.pow(25, 7)));
+  const rt = -rc * Math.sin(2 * dTheta);
+
+  return Math.sqrt(
+    Math.pow(dLp / sl, 2)
+    + Math.pow(dCp / sc, 2)
+    + Math.pow(dHpFinal / sh, 2)
+    + rt * (dCp / sc) * (dHpFinal / sh),
+  );
+}
diff --git a/src/lib/analysis/embeddings/types.ts b/src/lib/analysis/embeddings/types.ts
new file mode 100644
index 000000000..0c075b553
--- /dev/null
+++ b/src/lib/analysis/embeddings/types.ts
@@ -0,0 +1,31 @@
+/**
+ * Public types for the sentence-embedding provider.
+ *
+ * The model identifier and dimension are exposed so consumers can persist
+ * them alongside stored embeddings and detect mismatch on load (e.g. if
+ * we switch to a larger model later, old vectors must be re-generated).
+ */
+
+export const EMBEDDING_MODEL_ID = 'Xenova/all-MiniLM-L6-v2';
+export const EMBEDDING_MODEL_DIM = 384;
+
+export interface EmbeddingsProgress {
+  stage: 'loading-model' | 'idle';
+  percent: number;
+}
+
+export interface EmbeddingsOptions {
+  onProgress?: (progress: EmbeddingsProgress) => void;
+  signal?: AbortSignal;
+}
+
+export interface EmbeddingsProvider {
+  /** Ensures the model is loaded; safe to call repeatedly. */
+  ensureReady(options?: EmbeddingsOptions): Promise<void>;
+  /** Embed one text. Returns a unit-length 384-dim vector. */
+  embed(text: string, options?: EmbeddingsOptions): Promise<Float32Array>;
+  /** Embed a batch. More efficient than calling `embed` in a loop. */
+  embedBatch(texts: string[], options?: EmbeddingsOptions): Promise<Float32Array[]>;
+  /** Release the worker and free the underlying model memory.
*/ + dispose(): void; +} diff --git a/src/lib/analysis/index.ts b/src/lib/analysis/index.ts index 1f4da7258..7ba4642e5 100644 --- a/src/lib/analysis/index.ts +++ b/src/lib/analysis/index.ts @@ -1,7 +1,12 @@ export { OpticalFlowAnalyzer } from './optical-flow-analyzer'; export type { MotionResult } from './optical-flow-analyzer'; export { detectScenes, clearSceneCache } from './scene-detection'; -export type { SceneCut, SceneDetectionProgress, DetectScenesOptions, VerificationModel } from './scene-detection'; +export type { + SceneCut, + SceneDetectionProgress, + DetectScenesOptions, + VerificationModel, +} from './scene-detection'; export { getDefaultSceneVerificationProvider, getSceneVerificationModelLabel, @@ -14,4 +19,28 @@ export type { HistogramDetectOptions } from './histogram-scene-detection'; export { seekVideo, deduplicateCuts } from './scene-detection-utils'; export { captionVideo, captionImage } from './media-tagger'; export type { MediaCaption, CaptioningProgress, CaptioningOptions } from './media-tagger'; +export { + embeddingsProvider, + EMBEDDING_MODEL_ID, + EMBEDDING_MODEL_DIM, + clipProvider, + CLIP_MODEL_ID, + CLIP_EMBEDDING_DIM, + buildEmbeddingText, + sliceTranscript, + extractDominantColors, + extractDominantColorPhrase, + rgbToLab, + deltaE76, + deltaE2000, +} from './embeddings'; +export type { + EmbeddingsOptions, + EmbeddingsProgress, + EmbeddingsProvider, + BuildEmbeddingTextInput, + TranscriptSegment, + PaletteEntry, + LabColor, +} from './embeddings'; export { ANALYSIS_WIDTH, ANALYSIS_HEIGHT, PYRAMID_LEVELS } from './optical-flow-shaders'; diff --git a/src/lib/analysis/lfm-scene-worker.ts b/src/lib/analysis/lfm-scene-worker.ts index 2021b4f01..566d2fd68 100644 --- a/src/lib/analysis/lfm-scene-worker.ts +++ b/src/lib/analysis/lfm-scene-worker.ts @@ -22,6 +22,10 @@ import { RawImage, env, } from '@huggingface/transformers'; +import { + LFM_SCENE_CAPTION_PROMPT, + parseSceneCaptionResponse, +} from './captioning/scene-caption-format'; const MODEL_ID = 'LiquidAI/LFM2.5-VL-450M-ONNX'; @@ -36,6 +40,7 @@ let model: any = null; let loading = false; let disposed = false; let loadGeneration = 0; +const DESCRIBE_MAX_NEW_TOKENS = 160; function post(msg: Record): void { self.postMessage(msg); @@ -192,8 +197,6 @@ async function verifyCandidate( } } -const DESCRIBE_PROMPT = 'Describe the scene in one sentence.'; - async function describeImage(id: number, imageBlob: Blob): Promise { if (!model || !processor) { post({ type: 'error', message: 'Model not loaded' }); @@ -208,7 +211,7 @@ async function describeImage(id: number, imageBlob: Blob): Promise { role: 'user', content: [ { type: 'image' }, - { type: 'text', text: DESCRIBE_PROMPT }, + { type: 'text', text: LFM_SCENE_CAPTION_PROMPT }, ], }, ]; @@ -221,7 +224,7 @@ async function describeImage(id: number, imageBlob: Blob): Promise { const outputs = await model.generate({ ...inputs, - max_new_tokens: 128, + max_new_tokens: DESCRIBE_MAX_NEW_TOKENS, do_sample: false, repetition_penalty: 1.05, }); @@ -231,8 +234,13 @@ async function describeImage(id: number, imageBlob: Blob): Promise { { skip_special_tokens: true }, ); - const caption = (decoded[0] ?? '').trim(); - post({ type: 'caption', id, caption }); + const parsed = parseSceneCaptionResponse(decoded[0] ?? 
''); + post({ + type: 'caption', + id, + caption: parsed.text, + sceneData: parsed.sceneData, + }); } catch (err) { post({ type: 'caption', id, caption: '', error: (err as Error).message }); } diff --git a/src/main.tsx b/src/main.tsx index b196aca22..b348cfdf0 100644 --- a/src/main.tsx +++ b/src/main.tsx @@ -56,7 +56,8 @@ window.addEventListener('vite:preloadError', () => { }); // IMPORTANT: Intentionally do not dispose filmstrip cache on beforeunload. -// Filmstrip OPFS data is persistent and should survive refresh/reload. +// Filmstrip cache data is persistent in the workspace and +// should survive refresh/reload. // The browser tears down workers/resources on navigation anyway. const rootElement = document.getElementById('root'); @@ -70,4 +71,3 @@ createRoot(rootElement).render( ); - diff --git a/src/routes/projects/index.tsx b/src/routes/projects/index.tsx index 1bf4b1725..21b39148c 100644 --- a/src/routes/projects/index.tsx +++ b/src/routes/projects/index.tsx @@ -36,7 +36,7 @@ export const Route = createFileRoute('/projects/')({ // Clean up any media blob URLs when returning to projects page beforeLoad: async () => { cleanupBlobUrls(); - // Always reload projects from IndexedDB to get fresh data (thumbnails may have changed) + // Always reload projects from storage to get fresh data (thumbnails may have changed) const { loadProjects } = useProjectStore.getState(); await loadProjects(); }, @@ -473,4 +473,3 @@ function ProjectsIndex() { ); } - diff --git a/src/shared/components/color-scopes-view.tsx b/src/shared/components/color-scopes-view.tsx index 5a2052b9c..b14686e45 100644 --- a/src/shared/components/color-scopes-view.tsx +++ b/src/shared/components/color-scopes-view.tsx @@ -1,6 +1,6 @@ import { memo, useCallback, useEffect, useRef, useState } from 'react'; import { Activity } from 'lucide-react'; -import { usePlaybackStore } from '@/shared/state/playback'; +import { getResolvedPlaybackFrame, usePlaybackStore } from '@/shared/state/playback'; import { usePreviewBridgeStore } from '@/shared/state/preview-bridge'; import { cn } from '@/shared/ui/cn'; import { Button } from '@/components/ui/button'; @@ -602,8 +602,15 @@ export const ColorScopesView = memo(function ColorScopesView({ if (!captureFrameImageData && !captureFrame) return; const getRequestedFrame = () => { - const s = usePlaybackStore.getState(); - return s.previewFrame ?? s.currentFrame; + const playbackState = usePlaybackStore.getState(); + return getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); }; const requestedFrame = getRequestedFrame(); @@ -715,21 +722,66 @@ export const ColorScopesView = memo(function ColorScopesView({ scheduleDraw(); - const unsubscribe = usePlaybackStore.subscribe((state, previousState) => { + const scheduleIfFrameChanged = (nextRequestedFrame: number, previousRequestedFrame: number) => { + if (nextRequestedFrame !== previousRequestedFrame) { + scheduleDraw(); + } + }; + + const unsubscribePlayback = usePlaybackStore.subscribe((state, previousState) => { if (state.isPlaying) { return; } - const nextRequestedFrame = state.previewFrame ?? state.currentFrame; - const previousRequestedFrame = previousState.previewFrame ?? 
previousState.currentFrame; + const nextRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: state.currentFrame, + currentFrameEpoch: state.currentFrameEpoch, + previewFrame: state.previewFrame, + previewFrameEpoch: state.previewFrameEpoch, + isPlaying: state.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); + const previousRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: previousState.currentFrame, + currentFrameEpoch: previousState.currentFrameEpoch, + previewFrame: previousState.previewFrame, + previewFrameEpoch: previousState.previewFrameEpoch, + isPlaying: previousState.isPlaying, + displayedFrame: usePreviewBridgeStore.getState().displayedFrame, + }); - if (nextRequestedFrame !== previousRequestedFrame) { - scheduleDraw(); + scheduleIfFrameChanged(nextRequestedFrame, previousRequestedFrame); + }); + + const unsubscribePreviewBridge = usePreviewBridgeStore.subscribe((bridgeState, previousBridgeState) => { + const playbackState = usePlaybackStore.getState(); + if (playbackState.isPlaying) { + return; } + + const nextRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: bridgeState.displayedFrame, + }); + const previousRequestedFrame = getResolvedPlaybackFrame({ + currentFrame: playbackState.currentFrame, + currentFrameEpoch: playbackState.currentFrameEpoch, + previewFrame: playbackState.previewFrame, + previewFrameEpoch: playbackState.previewFrameEpoch, + isPlaying: playbackState.isPlaying, + displayedFrame: previousBridgeState.displayedFrame, + }); + + scheduleIfFrameChanged(nextRequestedFrame, previousRequestedFrame); }); return () => { - unsubscribe(); + unsubscribePlayback(); + unsubscribePreviewBridge(); if (rafId !== null) { cancelAnimationFrame(rafId); } diff --git a/src/shared/state/playback/index.ts b/src/shared/state/playback/index.ts index 3a6715481..745d9808d 100644 --- a/src/shared/state/playback/index.ts +++ b/src/shared/state/playback/index.ts @@ -1,5 +1,6 @@ export { usePlaybackStore } from './store'; export { getResolvedPlaybackFrame } from './frame-resolution'; +export { commitPreviewFrameToCurrentFrame } from './preview-handoff'; export type { CaptureOptions, PreviewQuality, diff --git a/src/shared/state/playback/preview-handoff.test.ts b/src/shared/state/playback/preview-handoff.test.ts new file mode 100644 index 000000000..62e8ceda0 --- /dev/null +++ b/src/shared/state/playback/preview-handoff.test.ts @@ -0,0 +1,45 @@ +import { beforeEach, describe, expect, it } from 'vitest'; +import { usePlaybackStore } from './store'; +import { commitPreviewFrameToCurrentFrame } from './preview-handoff'; + +describe('commitPreviewFrameToCurrentFrame', () => { + beforeEach(() => { + usePlaybackStore.setState({ + currentFrame: 12, + currentFrameEpoch: 0, + isPlaying: false, + playbackRate: 1, + loop: false, + volume: 1, + muted: false, + masterBusDb: 0, + busAudioEq: undefined, + zoom: -1, + previewFrame: null, + previewFrameEpoch: 0, + frameUpdateEpoch: 0, + previewItemId: null, + useProxy: true, + previewQuality: 1, + }); + }); + + it('promotes the active preview frame before clearing it', () => { + usePlaybackStore.getState().setPreviewFrame(48, 'item-1'); + + commitPreviewFrameToCurrentFrame(); + + const state = usePlaybackStore.getState(); + expect(state.currentFrame).toBe(48); + 
expect(state.previewFrame).toBeNull(); + expect(state.previewItemId).toBeNull(); + }); + + it('does nothing when there is no active preview frame', () => { + commitPreviewFrameToCurrentFrame(); + + const state = usePlaybackStore.getState(); + expect(state.currentFrame).toBe(12); + expect(state.previewFrame).toBeNull(); + }); +}); diff --git a/src/shared/state/playback/preview-handoff.ts b/src/shared/state/playback/preview-handoff.ts new file mode 100644 index 000000000..55ddc1577 --- /dev/null +++ b/src/shared/state/playback/preview-handoff.ts @@ -0,0 +1,16 @@ +import { usePlaybackStore } from './store'; + +/** + * Promote an active transient skim/preview frame into the authoritative + * current frame before clearing preview state. This prevents edit gestures + * from briefly snapping back to the stale pre-skim playhead frame. + */ +export function commitPreviewFrameToCurrentFrame(): void { + const playback = usePlaybackStore.getState(); + if (playback.previewFrame === null) { + return; + } + + playback.setScrubFrame(playback.previewFrame, playback.previewItemId); + playback.setPreviewFrame(null); +} diff --git a/src/shared/state/preview-bridge/store.test.ts b/src/shared/state/preview-bridge/store.test.ts index b914e443a..b2f5970ab 100644 --- a/src/shared/state/preview-bridge/store.test.ts +++ b/src/shared/state/preview-bridge/store.test.ts @@ -8,6 +8,7 @@ describe('preview-bridge-store', () => { captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, }); }); @@ -17,6 +18,7 @@ describe('preview-bridge-store', () => { captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, }); }); @@ -60,4 +62,24 @@ describe('preview-bridge-store', () => { expect(await state.captureFrameImageData?.()).toBeNull(); expect(await state.captureCanvasSource?.()).toBeNull(); }); + + it('stores post-edit warm requests with normalized frames and incrementing tokens', () => { + const store = usePreviewBridgeStore.getState(); + + store.requestPostEditWarm(48.6, ['clip-1'], [48.6, 47.8, 48.6, -2]); + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toEqual({ + frame: 49, + frames: [49, 48, 0], + itemIds: ['clip-1'], + token: 1, + }); + + store.requestPostEditWarm(-2, ['clip-2', 'clip-3']); + expect(usePreviewBridgeStore.getState().postEditWarmRequest).toEqual({ + frame: 0, + frames: [0], + itemIds: ['clip-2', 'clip-3'], + token: 2, + }); + }); }); diff --git a/src/shared/state/preview-bridge/store.ts b/src/shared/state/preview-bridge/store.ts index 5027c7351..920601c11 100644 --- a/src/shared/state/preview-bridge/store.ts +++ b/src/shared/state/preview-bridge/store.ts @@ -12,11 +12,26 @@ function normalizeFrame(frame: number | null): number | null { return Math.max(0, Math.round(frame)); } +function normalizeFrames(frames: number[]): number[] { + const normalized: number[] = []; + const seen = new Set(); + + for (const frame of frames) { + const nextFrame = normalizeFrame(frame); + if (nextFrame == null || seen.has(nextFrame)) continue; + seen.add(nextFrame); + normalized.push(nextFrame); + } + + return normalized; +} + export const usePreviewBridgeStore = create()((set) => ({ displayedFrame: null, captureFrame: null, captureFrameImageData: null, captureCanvasSource: null, + postEditWarmRequest: null, setDisplayedFrame: (frame) => set((state) => { @@ -27,4 +42,18 @@ export const usePreviewBridgeStore = create set({ captureFrame: fn }), setCaptureFrameImageData: (fn) => set({ captureFrameImageData: fn }), 
 setCaptureCanvasSource: (fn) => set({ captureCanvasSource: fn }),
+  requestPostEditWarm: (frame, itemIds, frames = []) =>
+    set((state) => {
+      const normalizedFrame = normalizeFrame(frame) ?? 0;
+      const normalizedFrames = normalizeFrames(frames.length > 0 ? frames : [normalizedFrame]);
+
+      return {
+        postEditWarmRequest: {
+          frame: normalizedFrame,
+          frames: normalizedFrames,
+          itemIds: [...itemIds],
+          token: (state.postEditWarmRequest?.token ?? 0) + 1,
+        },
+      };
+    }),
 }));
diff --git a/src/shared/state/preview-bridge/types.ts b/src/shared/state/preview-bridge/types.ts
index 435144cb9..719b02fd5 100644
--- a/src/shared/state/preview-bridge/types.ts
+++ b/src/shared/state/preview-bridge/types.ts
@@ -1,5 +1,12 @@
 import type { CaptureOptions } from '@/shared/state/playback';
 
+export interface PostEditWarmRequest {
+  frame: number;
+  frames: number[];
+  itemIds: string[];
+  token: number;
+}
+
 export interface PreviewBridgeState {
   /** Frame currently presented to the user in preview output (null when Player path is active) */
   displayedFrame: number | null;
@@ -9,6 +16,8 @@
   captureFrameImageData: ((options?: CaptureOptions) => Promise<ImageData | null>) | null;
   /** Returns the rendered canvas directly for GPU-accelerated scope analysis (near-zero-copy) */
   captureCanvasSource: (() => Promise<HTMLCanvasElement | OffscreenCanvas | null>) | null;
+  /** Latest request to prewarm the preview renderer after an edit commit. */
+  postEditWarmRequest: PostEditWarmRequest | null;
 }
 
 export interface PreviewBridgeActions {
@@ -19,4 +28,5 @@
   setCaptureFrameImageData: (fn: ((options?: CaptureOptions) => Promise<ImageData | null>) | null) => void;
   /** Register canvas source capture for GPU scopes (optional) */
   setCaptureCanvasSource: (fn: (() => Promise<HTMLCanvasElement | OffscreenCanvas | null>) | null) => void;
+  requestPostEditWarm: (frame: number, itemIds: string[], frames?: number[]) => void;
 }
diff --git a/src/shared/state/source-player/store.test.ts b/src/shared/state/source-player/store.test.ts
index 94b49196f..754a3cf8c 100644
--- a/src/shared/state/source-player/store.test.ts
+++ b/src/shared/state/source-player/store.test.ts
@@ -8,6 +8,7 @@
     playerMethods: null,
     currentMediaId: null,
     currentSourceFrame: 0,
+    previewSourceFrame: null,
     inPoint: null,
     outPoint: null,
     pendingSeekFrame: null,
@@ -25,6 +26,7 @@
     expect(useSourcePlayerStore.getState()).toMatchObject({
       currentMediaId: null,
      currentSourceFrame: 0,
+      previewSourceFrame: null,
      inPoint: null,
      outPoint: null,
    });
@@ -44,6 +46,7 @@
      currentMediaId: 'media-2',
      inPoint: 75,
      outPoint: 150,
+      previewSourceFrame: null,
      pendingSeekFrame: 75,
    });
  });
diff --git a/src/shared/state/source-player/store.ts b/src/shared/state/source-player/store.ts
index 7285d7521..f03e0807d 100644
--- a/src/shared/state/source-player/store.ts
+++ b/src/shared/state/source-player/store.ts
@@ -6,20 +6,39 @@ export const useSourcePlayerStore = create((set) => ({
   playerMethods: null,
   currentMediaId: null,
   currentSourceFrame: 0,
+  previewSourceFrame: null,
   inPoint: null,
   outPoint: null,
   pendingSeekFrame: null,
+  pendingPlay: false,
   setHoveredPanel: (panel) => set({ hoveredPanel: panel }),
   setPlayerMethods: (methods) => set({ playerMethods: methods }),
   setCurrentMediaId: (id) =>
     set((state) => {
       if (id === state.currentMediaId) return state;
-      return { currentMediaId: id, inPoint: null, outPoint: null, currentSourceFrame: 0 };
+      return {
+        currentMediaId: id,
+        inPoint: null,
+        outPoint: null,
+
currentSourceFrame: 0, + previewSourceFrame: null, + pendingSeekFrame: null, + pendingPlay: false, + }; }), releaseCurrentMediaId: (id) => set((state) => { if (state.currentMediaId !== id) return state; - return { currentMediaId: null, inPoint: null, outPoint: null, currentSourceFrame: 0 }; + return { + currentMediaId: null, + inPoint: null, + outPoint: null, + currentSourceFrame: 0, + previewSourceFrame: null, + pendingSeekFrame: null, + pendingPlay: false, + }; }), setCurrentSourceFrame: (frame) => set({ currentSourceFrame: frame }), + setPreviewSourceFrame: (frame) => set({ previewSourceFrame: frame }), setInPoint: (frame) => set((state) => { if (frame !== null && state.outPoint !== null && frame >= state.outPoint) { return { inPoint: frame, outPoint: null }; @@ -34,4 +53,5 @@ export const useSourcePlayerStore = create((set) => ({ }), clearInOutPoints: () => set({ inPoint: null, outPoint: null }), setPendingSeekFrame: (frame) => set({ pendingSeekFrame: frame }), + setPendingPlay: (play) => set({ pendingPlay: play }), })); diff --git a/src/shared/state/source-player/types.ts b/src/shared/state/source-player/types.ts index 47016c462..55283675a 100644 --- a/src/shared/state/source-player/types.ts +++ b/src/shared/state/source-player/types.ts @@ -1,5 +1,13 @@ export interface SourcePlayerMethods { toggle: () => void; + /** + * Unconditional pause — no-ops when already paused. Exposed so callers + * outside the source monitor (e.g. scene browser clicks) can stop the + * current scene synchronously before queueing a seek, instead of + * waiting for the seek-consume effect and racing with the video + * element still decoding the old frame. + */ + pause: () => void; seek: (frame: number) => void; frameBack: (frames: number) => void; frameForward: (frames: number) => void; @@ -11,16 +19,27 @@ export interface SourcePlayerState { playerMethods: SourcePlayerMethods | null; currentMediaId: string | null; currentSourceFrame: number; + previewSourceFrame: number | null; inPoint: number | null; outPoint: number | null; pendingSeekFrame: number | null; + /** + * When true, the source monitor starts playback after consuming the + * next `pendingSeekFrame`. The monitor always pauses before seeking, + * so scene-browser single-click just queues a seek (leaves paused) + * while double-click queues `pendingPlay: true` to play from the new + * scene. 
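+   *
+   * @example
+   * // Hypothetical scene-browser handler; `cut.frame` is illustrative.
+   * playerMethods?.pause();          // stop synchronously
+   * setPendingSeekFrame(cut.frame);  // queue the seek
+   * setPendingPlay(isDoubleClick);   // resume only on double-click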
+ */ + pendingPlay: boolean; setHoveredPanel: (panel: 'source' | null) => void; setPlayerMethods: (methods: SourcePlayerMethods | null) => void; setCurrentMediaId: (id: string | null) => void; releaseCurrentMediaId: (id: string) => void; setCurrentSourceFrame: (frame: number) => void; + setPreviewSourceFrame: (frame: number | null) => void; setInPoint: (frame: number | null) => void; setOutPoint: (frame: number | null) => void; clearInOutPoints: () => void; setPendingSeekFrame: (frame: number | null) => void; + setPendingPlay: (play: boolean) => void; } diff --git a/src/shared/utils/browser-whisper-models.ts b/src/shared/utils/browser-whisper-models.ts index 72b0fae63..3986fa882 100644 --- a/src/shared/utils/browser-whisper-models.ts +++ b/src/shared/utils/browser-whisper-models.ts @@ -1,6 +1,6 @@ import type { MediaTranscriptModel } from '@/types/storage'; -export const DEFAULT_BROWSER_WHISPER_MODEL: MediaTranscriptModel = 'whisper-tiny'; +export const DEFAULT_BROWSER_WHISPER_MODEL: MediaTranscriptModel = 'whisper-small'; export const BROWSER_WHISPER_MODEL_LABELS: Record = { 'whisper-tiny': 'Tiny', @@ -10,7 +10,6 @@ export const BROWSER_WHISPER_MODEL_LABELS: Record }; export const BROWSER_WHISPER_MODEL_OPTIONS = [ - { value: 'whisper-tiny', label: BROWSER_WHISPER_MODEL_LABELS['whisper-tiny'] }, { value: 'whisper-base', label: BROWSER_WHISPER_MODEL_LABELS['whisper-base'] }, { value: 'whisper-small', label: BROWSER_WHISPER_MODEL_LABELS['whisper-small'] }, { value: 'whisper-large', label: BROWSER_WHISPER_MODEL_LABELS['whisper-large'] }, @@ -18,3 +17,19 @@ export const BROWSER_WHISPER_MODEL_OPTIONS = [ value: MediaTranscriptModel; label: string; }>; + +const SELECTABLE_BROWSER_WHISPER_MODELS = new Set( + BROWSER_WHISPER_MODEL_OPTIONS.map((option) => option.value), +); + +export function normalizeSelectableBrowserWhisperModel( + model: MediaTranscriptModel | undefined, +): MediaTranscriptModel { + if (!model) { + return DEFAULT_BROWSER_WHISPER_MODEL; + } + + return SELECTABLE_BROWSER_WHISPER_MODELS.has(model) + ? 
model + : DEFAULT_BROWSER_WHISPER_MODEL; +} diff --git a/src/shared/utils/schedule-after-paint.ts b/src/shared/utils/schedule-after-paint.ts new file mode 100644 index 000000000..0f173fe2e --- /dev/null +++ b/src/shared/utils/schedule-after-paint.ts @@ -0,0 +1,18 @@ +export function scheduleAfterPaint(task: () => void): () => void { + if (typeof window === 'undefined') { + const timeoutId = setTimeout(task, 0); + return () => clearTimeout(timeoutId); + } + + let timeoutId: number | null = null; + const rafId = window.requestAnimationFrame(() => { + timeoutId = window.setTimeout(task, 0); + }); + + return () => { + window.cancelAnimationFrame(rafId); + if (timeoutId !== null) { + window.clearTimeout(timeoutId); + } + }; +} diff --git a/src/shared/utils/transcription-cancellation.ts b/src/shared/utils/transcription-cancellation.ts new file mode 100644 index 000000000..fe0c024ef --- /dev/null +++ b/src/shared/utils/transcription-cancellation.ts @@ -0,0 +1,44 @@ +import { LOCAL_INFERENCE_UNLOADED_MESSAGE } from '@/shared/state/local-inference'; + +export const TRANSCRIPTION_CANCELLED_MESSAGE = 'Transcription cancelled'; + +export function isTranscriptionCancellationError(error: unknown): boolean { + return error instanceof Error && ( + error.message === TRANSCRIPTION_CANCELLED_MESSAGE + || error.message === LOCAL_INFERENCE_UNLOADED_MESSAGE + ); +} + +const OOM_PATTERNS = [ + /out of memory/i, + /\boom\b/i, + /insufficient memory/i, + /allocation failed/i, + /failed to allocate/i, + /cannot allocate/i, + /memory allocation/i, + /array buffer allocation/i, + /device lost/i, + /webgpu.*buffer/i, + /createbuffer/i, + /wasm memory/i, + /maximum.*memory/i, +]; + +export function isTranscriptionOutOfMemoryError(error: unknown): boolean { + // RangeError from buffer allocation is the clearest OOM signal from browsers. + if (error instanceof RangeError) return true; + + if (!(error instanceof Error)) { + if (typeof error === 'string') { + return OOM_PATTERNS.some((pattern) => pattern.test(error)); + } + return false; + } + + const message = `${error.message} ${error.name}`; + return OOM_PATTERNS.some((pattern) => pattern.test(message)); +} + +export const TRANSCRIPTION_OOM_HINT = + 'The model ran out of memory. 
Try a lower quantization (q8 or q4) or a smaller model in Settings → Whisper, then try again.'; diff --git a/src/shared/utils/transcription-progress.test.ts b/src/shared/utils/transcription-progress.test.ts index ae6f0abc7..f7e16ed44 100644 --- a/src/shared/utils/transcription-progress.test.ts +++ b/src/shared/utils/transcription-progress.test.ts @@ -7,6 +7,7 @@ import { describe('transcription-progress', () => { it('maps stages into a stable overall percentage range', () => { + expect(getTranscriptionOverallPercent({ stage: 'queued', progress: 1 })).toBe(0); expect(getTranscriptionOverallPercent({ stage: 'loading', progress: 1 })).toBe(35); expect(getTranscriptionOverallPercent({ stage: 'decoding', progress: 0.5 })).toBeCloseTo(52.5); expect(getTranscriptionOverallPercent({ stage: 'transcribing', progress: 0.5 })).toBe(85); @@ -40,8 +41,9 @@ describe('transcription-progress', () => { }); it('formats readable stage labels', () => { + expect(getTranscriptionStageLabel('queued')).toBe('Queued'); expect(getTranscriptionStageLabel('loading')).toBe('Loading model'); - expect(getTranscriptionStageLabel('decoding')).toBe('Decoding audio'); + expect(getTranscriptionStageLabel('decoding')).toBe('Preparing audio'); expect(getTranscriptionStageLabel('transcribing')).toBe('Transcribing'); }); }); diff --git a/src/shared/utils/transcription-progress.ts b/src/shared/utils/transcription-progress.ts index 74f538eda..2e20350dc 100644 --- a/src/shared/utils/transcription-progress.ts +++ b/src/shared/utils/transcription-progress.ts @@ -1,4 +1,4 @@ -export type TranscriptionProgressStage = 'loading' | 'decoding' | 'transcribing'; +export type TranscriptionProgressStage = 'queued' | 'loading' | 'decoding' | 'transcribing'; export interface TranscriptionProgressSnapshot { stage: TranscriptionProgressStage; @@ -15,6 +15,8 @@ export function getTranscriptionOverallProgress( const normalizedProgress = clampProgress(snapshot.progress); switch (snapshot.stage) { + case 'queued': + return 0; case 'loading': return normalizedProgress * 0.35; case 'decoding': @@ -50,10 +52,12 @@ export function mergeTranscriptionProgress( export function getTranscriptionStageLabel(stage: TranscriptionProgressStage): string { switch (stage) { + case 'queued': + return 'Queued'; case 'loading': return 'Loading model'; case 'decoding': - return 'Decoding audio'; + return 'Preparing audio'; case 'transcribing': return 'Transcribing'; } diff --git a/src/shared/utils/whisper-settings.ts b/src/shared/utils/whisper-settings.ts index 8f86570e5..d8149f2ae 100644 --- a/src/shared/utils/whisper-settings.ts +++ b/src/shared/utils/whisper-settings.ts @@ -3,6 +3,7 @@ import { BROWSER_WHISPER_MODEL_LABELS, BROWSER_WHISPER_MODEL_OPTIONS, DEFAULT_BROWSER_WHISPER_MODEL, + normalizeSelectableBrowserWhisperModel, } from './browser-whisper-models'; export const DEFAULT_WHISPER_MODEL: MediaTranscriptModel = DEFAULT_BROWSER_WHISPER_MODEL; @@ -17,6 +18,12 @@ export const WHISPER_MODEL_OPTIONS: ReadonlyArray<{ label: string; }> = BROWSER_WHISPER_MODEL_OPTIONS; +export function normalizeSelectableWhisperModel( + model: MediaTranscriptModel | undefined, +): MediaTranscriptModel { + return normalizeSelectableBrowserWhisperModel(model); +} + export const WHISPER_QUANTIZATION_OPTIONS: ReadonlyArray<{ value: MediaTranscriptQuantization; label: string; @@ -51,9 +58,10 @@ export const WHISPER_QUANTIZATION_OPTIONS: ReadonlyArray<{ export function getWhisperQuantizationOption( value: MediaTranscriptQuantization | undefined, -) { +): (typeof 
WHISPER_QUANTIZATION_OPTIONS)[number] { + const fallback = WHISPER_QUANTIZATION_OPTIONS[0]!; return WHISPER_QUANTIZATION_OPTIONS.find((option) => option.value === value) - ?? WHISPER_QUANTIZATION_OPTIONS[0]; + ?? fallback; } const WHISPER_LANGUAGE_NAMES = { diff --git a/src/types/project.ts b/src/types/project.ts index 5a8bed1e0..d1b6bdc8b 100644 --- a/src/types/project.ts +++ b/src/types/project.ts @@ -15,7 +15,7 @@ export interface Project { * Increment CURRENT_SCHEMA_VERSION in lib/migrations when adding migrations. */ schemaVersion?: number; - thumbnailId?: string; // Reference to ThumbnailData in IndexedDB + thumbnailId?: string; // Reference to workspace-backed ThumbnailData thumbnail?: string; // @deprecated Base64 data URL (for backward compatibility) metadata: ProjectResolution; timeline?: ProjectTimeline; @@ -80,8 +80,9 @@ export interface ProjectTimeline { sourceDuration?: number; // Total duration of source media (frames) sourceFps?: number; // Source media frame rate for source* frame fields text?: string; + textRole?: 'caption'; captionSource?: { - type: 'transcript'; + type: 'transcript' | 'ai-captions'; clipId: string; mediaId: string; }; diff --git a/src/types/storage.ts b/src/types/storage.ts index 6b1a60923..77c9d32e9 100644 --- a/src/types/storage.ts +++ b/src/types/storage.ts @@ -54,7 +54,8 @@ export interface MediaMetadata { audioCodecSupported?: boolean; /** * Conformed preview-audio asset path for custom-decoded codecs. - * Points to a browser-native playable WAV stored in OPFS. + * Kept under the legacy name for compatibility, but now points to the + * workspace-backed persisted WAV path. */ previewAudioOpfsPath?: string; previewAudioMimeType?: string; @@ -74,8 +75,30 @@ export interface MediaMetadata { gopInterval?: number; thumbnailId?: string; tags: string[]; - /** AI-generated timestamped captions from LFM vision-language model. */ - aiCaptions?: Array<{ timeSec: number; text: string }>; + /** + * AI-generated timestamped captions from LFM vision-language model. + * Mirrors the canonical `cache/ai/captions.json` payload for in-memory + * consumers (search, Scene Browser). See `MediaCaption` in + * `lib/analysis/captioning/types.ts` for the full shape including optional + * thumbnail paths, semantic embeddings, and color palettes. + */ + aiCaptions?: Array<{ + timeSec: number; + text: string; + sceneData?: { + caption?: string; + shotType?: string; + subjects?: string[]; + action?: string; + setting?: string; + lighting?: string; + timeOfDay?: string; + weather?: string; + }; + thumbRelPath?: string; + embedding?: number[]; + palette?: Array<{ l: number; a: number; b: number; weight: number }>; + }>; createdAt: number; updatedAt: number; } @@ -157,7 +180,7 @@ export interface WaveformData { createdAt: number; } -// Streaming waveform cache records (meta + bins in IndexedDB). +// Streaming waveform cache records (meta + bins in persisted storage). export interface WaveformMeta { id: string; // Same as mediaId mediaId: string; diff --git a/src/types/timeline.ts b/src/types/timeline.ts index 737438a20..410fc3c17 100644 --- a/src/types/timeline.ts +++ b/src/types/timeline.ts @@ -94,7 +94,13 @@ type BaseTimelineItem = { }; export interface GeneratedCaptionSource { - type: 'transcript'; + /** + * `transcript` — generated from whisper speech-to-text segments. + * `ai-captions` — generated from vision-language-model frame descriptions + * (e.g. LFM captioning). Distinguished so replace/remove flows can target + * one kind without disturbing the other on the same clip. 
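+   *
+   * For example, caption items generated from whisper segments carry
+   * `{ type: 'transcript', clipId, mediaId }`, while items generated from
+   * LFM frame descriptions carry `{ type: 'ai-captions', clipId, mediaId }`
+   * for the same clip, so either set can be replaced or removed
+   * independently.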
+ */ + type: 'transcript' | 'ai-captions'; clipId: string; mediaId: string; } @@ -122,6 +128,7 @@ export type AudioItem = BaseTimelineItem & { export type TextItem = BaseTimelineItem & { type: 'text'; text: string; + textRole?: 'caption'; captionSource?: GeneratedCaptionSource; // Typography fontSize?: number; // Font size in pixels (default: 60) diff --git a/vercel.json b/vercel.json index ae401caa2..61635dfae 100644 --- a/vercel.json +++ b/vercel.json @@ -37,7 +37,7 @@ }, { "key": "Cross-Origin-Embedder-Policy", - "value": "require-corp" + "value": "credentialless" }, { "key": "Cross-Origin-Opener-Policy",