From 62c93e58d2d7963922b7d98cdadf216cc2714fa8 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 06:32:40 +0530 Subject: [PATCH 01/39] feat: implement caption word deletion, filler detection, and timeline ripple editing support --- .../src/routes/editor/TranscriptPage.tsx | 605 ++++++++++++++++-- .../src/routes/editor/caption-types.ts | 9 + apps/desktop/src/routes/editor/captions.ts | 54 +- .../src/routes/editor/filler-detection.ts | 68 ++ .../src/routes/editor/timeline-utils.ts | 121 ++++ apps/web/package.json | 2 +- crates/project/src/configuration.rs | 10 + crates/rendering/src/layers/captions.rs | 14 +- package.json | 2 +- 9 files changed, 814 insertions(+), 71 deletions(-) create mode 100644 apps/desktop/src/routes/editor/caption-types.ts create mode 100644 apps/desktop/src/routes/editor/filler-detection.ts diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 1c903efe65b..42064b2ed6b 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -12,14 +12,22 @@ import { } from "solid-js"; import { produce } from "solid-js/store"; import { commands } from "~/utils/tauri"; +import type { CaptionWordExtended } from "./caption-types"; import { createCaptionTrackSegments, getCaptionTextFromWords, } from "./captions"; import { FPS, useEditorContext } from "./context"; +import { + AUTO_CLEAN_SILENCE_THRESHOLD, + isFillerWord, + PAUSE_DETECTION_THRESHOLD, +} from "./filler-detection"; import { rippleDeleteAllTracks, + rippleInsertAllTracks, shiftCaptionTimesAfterCut, + shiftCaptionTimesAfterInsert, } from "./timeline-utils"; function formatTimePrecise(secs: number) { @@ -35,6 +43,20 @@ interface FlatWord { end: number; segmentIndex: number; wordIndex: number; + deleted: boolean; + isFiller: boolean; + isPause: boolean; + bufferStart: number; + bufferEnd: number; +} + +interface PauseIndicator { + type: "pause"; + start: number; + end: number; + duration: number; + afterSegmentIndex: number; + afterWordIndex: number; } interface TranscriptSegmentGroup { @@ -69,7 +91,7 @@ export function TranscriptPanel() { const result: FlatWord[] = []; for (let segIdx = 0; segIdx < segments.length; segIdx++) { const seg = segments[segIdx]; - const words = seg.words ?? []; + const words = (seg.words ?? []) as CaptionWordExtended[]; for (let wordIdx = 0; wordIdx < words.length; wordIdx++) { const w = words[wordIdx]; result.push({ @@ -78,12 +100,45 @@ export function TranscriptPanel() { end: w.end, segmentIndex: segIdx, wordIndex: wordIdx, + deleted: w.deleted ?? false, + isFiller: w.isFiller ?? isFillerWord(w.text), + isPause: w.isPause ?? false, + bufferStart: w.bufferStart ?? 0, + bufferEnd: w.bufferEnd ?? 0, }); } } return result; }); + const pauses = createMemo((): PauseIndicator[] => { + const words = allWords(); + const result: PauseIndicator[] = []; + for (let i = 1; i < words.length; i++) { + const prev = words[i - 1]; + const curr = words[i]; + if (prev.deleted || curr.deleted) continue; + const gap = curr.start - prev.end; + if (gap >= PAUSE_DETECTION_THRESHOLD) { + result.push({ + type: "pause", + start: prev.end, + end: curr.start, + duration: gap, + afterSegmentIndex: prev.segmentIndex, + afterWordIndex: prev.wordIndex, + }); + } + } + return result; + }); + + const fillerCount = createMemo( + () => allWords().filter((w) => w.isFiller && !w.deleted).length, + ); + + const pauseCount = createMemo(() => pauses().length); + const segmentGroups = createMemo((): TranscriptSegmentGroup[] => { const words = allWords(); const groups: TranscriptSegmentGroup[] = []; @@ -148,18 +203,15 @@ export function TranscriptPanel() { const words = allWords(); const wordsToDelete = flatIndices .map((idx) => words[idx]) - .filter((w): w is FlatWord => !!w); + .filter((w): w is FlatWord => !!w && !w.deleted); if (wordsToDelete.length === 0) return; - const sorted = [...wordsToDelete].sort((a, b) => { - if (a.segmentIndex !== b.segmentIndex) - return b.segmentIndex - a.segmentIndex; - return b.wordIndex - a.wordIndex; - }); - const timeRanges = wordsToDelete - .map((w) => ({ start: w.start, end: w.end })) + .map((w) => ({ + start: Math.max(0, w.start - (w.bufferStart || 0)), + end: w.end + (w.bufferEnd || 0), + })) .sort((a, b) => a.start - b.start); const mergedRanges: { start: number; end: number }[] = []; @@ -176,22 +228,24 @@ export function TranscriptPanel() { produce((p) => { if (!p.captions?.segments) return; - for (const word of sorted) { + for (const word of wordsToDelete) { const seg = p.captions.segments[word.segmentIndex]; if (!seg?.words) continue; - if (word.wordIndex < seg.words.length) { - seg.words.splice(word.wordIndex, 1); + const w = seg.words[word.wordIndex] as CaptionWordExtended; + if (w) { + w.deleted = true; } } - for (let i = p.captions.segments.length - 1; i >= 0; i--) { - const seg = p.captions.segments[i]; - if (!seg.words || seg.words.length === 0) { - p.captions.segments.splice(i, 1); - } else { - seg.text = getCaptionTextFromWords(seg.words); - seg.start = seg.words[0].start; - seg.end = seg.words[seg.words.length - 1].end; + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (seg.words && seg.words.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } } } @@ -230,6 +284,82 @@ export function TranscriptPanel() { } }; + const restoreWords = (flatIndices: number[]) => { + const words = allWords(); + const wordsToRestore = flatIndices + .map((idx) => words[idx]) + .filter((w): w is FlatWord => !!w && w.deleted); + + if (wordsToRestore.length === 0) return; + + const timeRanges = wordsToRestore + .map((w) => ({ + start: Math.max(0, w.start - (w.bufferStart || 0)), + end: w.end + (w.bufferEnd || 0), + })) + .sort((a, b) => a.start - b.start); + + const mergedRanges: { start: number; end: number }[] = []; + for (const range of timeRanges) { + const last = mergedRanges[mergedRanges.length - 1]; + if (last && range.start <= last.end) { + last.end = Math.max(last.end, range.end); + } else { + mergedRanges.push({ ...range }); + } + } + + setProject( + produce((p) => { + if (!p.captions?.segments) return; + + for (const range of mergedRanges) { + const insertDuration = range.end - range.start; + if (insertDuration <= 0.001) continue; + + shiftCaptionTimesAfterInsert( + p.captions.segments, + range.start, + insertDuration, + ); + + if (p.timeline) { + rippleInsertAllTracks(p.timeline, range.start, insertDuration); + } + } + + for (const word of wordsToRestore) { + const seg = p.captions.segments[word.segmentIndex]; + if (!seg?.words) continue; + const w = seg.words[word.wordIndex] as CaptionWordExtended; + if (w) { + w.deleted = false; + } + } + + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (seg.words && seg.words.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + + if (p.timeline && p.captions) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + }), + ); + + setEditorState("captions", "isStale", false); + }; + const handleDeleteWord = (flatIndex: number) => { applyWordDeletions([flatIndex]); }; @@ -238,6 +368,53 @@ export function TranscriptPanel() { applyWordDeletions(flatIndices); }; + const handleRestoreWord = (flatIndex: number) => { + restoreWords([flatIndex]); + }; + + const handleRestoreWords = (flatIndices: number[]) => { + restoreWords(flatIndices); + }; + + const [silenceThreshold, setSilenceThreshold] = makePersisted( + createSignal(AUTO_CLEAN_SILENCE_THRESHOLD), + { name: "editorAutoCleanThreshold" }, + ); + + const autoClean = () => { + const words = allWords(); + const ps = pauses(); + const threshold = silenceThreshold(); + + const fillerIndices: number[] = []; + for (let i = 0; i < words.length; i++) { + if (!words[i].deleted && words[i].isFiller) { + fillerIndices.push(i); + } + } + + const pauseWordIndices: number[] = []; + for (const p of ps) { + if (p.duration >= threshold) { + for (let i = 0; i < words.length; i++) { + const w = words[i]; + if ( + !w.deleted && + w.start >= p.start - 0.01 && + w.end <= p.end + 0.01 + ) { + pauseWordIndices.push(i); + } + } + } + } + + const allIndices = [...new Set([...fillerIndices, ...pauseWordIndices])]; + if (allIndices.length > 0) { + applyWordDeletions(allIndices); + } + }; + const isAtEnd = () => { const total = totalDuration(); return total > 0 && total - editorState.playbackTime <= 0.1; @@ -291,11 +468,80 @@ export function TranscriptPanel() { handlePlayPause(); }); + const [showAutoCleanDropdown, setShowAutoCleanDropdown] = createSignal(false); + return (
Transcript
+ 0 || pauseCount() > 0}> + + {fillerCount() > 0 && + `${fillerCount()} filler${fillerCount() > 1 ? "s" : ""}`} + {fillerCount() > 0 && pauseCount() > 0 && ", "} + {pauseCount() > 0 && + `${pauseCount()} pause${pauseCount() > 1 ? "s" : ""}`} + + +
+
+ + +
+ +
+
+ Silence Threshold +
+
+ + setSilenceThreshold( + Number.parseFloat(e.currentTarget.value), + ) + } + class="flex-1 h-1 accent-blue-9" + /> + + {silenceThreshold().toFixed(1)}s + +
+ +
+
+
); } +function BufferPopover(props: { + word: FlatWord; + position: { x: number; y: number }; + onClose: () => void; + onBufferChange: ( + segmentIndex: number, + wordIndex: number, + bufferStart: number, + bufferEnd: number, + ) => void; + onRestore: () => void; +}) { + const [bufStart, setBufStart] = createSignal(props.word.bufferStart); + const [bufEnd, setBufEnd] = createSignal(props.word.bufferEnd); + let popoverRef: HTMLDivElement | undefined; + + const handleClickOutside = (e: MouseEvent) => { + if (popoverRef && !popoverRef.contains(e.target as Node)) { + props.onClose(); + } + }; + + const handleEscape = (e: KeyboardEvent) => { + if (e.key === "Escape") props.onClose(); + }; + + createEventListener(document, "mousedown", handleClickOutside); + createEventListener(window, "keydown", handleEscape); + + const updateBuffer = (start: number, end: number) => { + setBufStart(start); + setBufEnd(end); + props.onBufferChange( + props.word.segmentIndex, + props.word.wordIndex, + start, + end, + ); + }; + + const popoverStyle = () => { + const x = Math.min(props.position.x, window.innerWidth - 220); + const y = Math.min(props.position.y, window.innerHeight - 200); + return { + position: "fixed" as const, + left: `${x}px`, + top: `${y}px`, + "z-index": "9999", + }; + }; + + return ( +
+
+
+ + Adjust Buffer + + +
+

+ Buffer around deleted word to preserve pronunciations. +

+ +
+
+
+ Start Buffer + + {bufStart().toFixed(2)}s + +
+ + updateBuffer(Number.parseFloat(e.currentTarget.value), bufEnd()) + } + class="w-full h-1 accent-blue-9" + /> +
+
+
+ End Buffer + + {bufEnd().toFixed(2)}s + +
+ + updateBuffer( + bufStart(), + Number.parseFloat(e.currentTarget.value), + ) + } + class="w-full h-1 accent-blue-9" + /> +
+
+ + + + +
+
+ ); +} + function WordWithTooltip(props: { word: FlatWord; isActive: boolean; @@ -341,6 +719,8 @@ function WordWithTooltip(props: { ref: (el: HTMLSpanElement) => void; onClick: (e: MouseEvent) => void; onDelete: () => void; + onRestore: () => void; + onContextMenu: (e: MouseEvent) => void; }) { const [hovering, setHovering] = createSignal(false); let hoverTimer: number | undefined; @@ -361,14 +741,25 @@ function WordWithTooltip(props: { ref={props.ref} class={cx( "cursor-pointer transition-colors duration-100 rounded-xs relative", - props.isSelected && "bg-blue-4/50", - props.isActive - ? "text-blue-11" - : props.isSelected + props.word.deleted + ? "line-through opacity-40 text-red-9 bg-red-3/30" + : props.word.isFiller + ? "border-b-2 border-dotted border-amber-8/80 bg-amber-3/15" + : "", + !props.word.deleted && props.isSelected && "bg-blue-4/50", + props.word.deleted + ? "hover:opacity-60" + : props.isActive ? "text-blue-11" - : "text-gray-9 hover:text-gray-12", + : props.isSelected + ? "text-blue-11" + : "text-gray-9 hover:text-gray-12", )} onClick={(e) => props.onClick(e)} + onContextMenu={(e) => { + e.preventDefault(); + props.onContextMenu(e); + }} onMouseEnter={onEnter} onMouseLeave={onLeave} > @@ -382,16 +773,32 @@ function WordWithTooltip(props: { {formatTimePrecise(props.word.start)} - + } > - - + + @@ -399,19 +806,36 @@ function WordWithTooltip(props: { ); } +function PauseBadge(props: { pause: PauseIndicator }) { + return ( + + ⏸ {props.pause.duration.toFixed(1)}s + + ); +} + function TranscriptEditor(props: { segmentGroups: TranscriptSegmentGroup[]; allWords: FlatWord[]; + pauses: PauseIndicator[]; activeWordIndex: number; textSizeClass: string; onWordClick: (word: FlatWord) => void; onDeleteWord: (flatIndex: number) => void; onDeleteWords: (flatIndices: number[]) => void; + onRestoreWord: (flatIndex: number) => void; + onRestoreWords: (flatIndices: number[]) => void; }) { + const { setProject } = useEditorContext(); const [selectedIndices, setSelectedIndices] = createSignal>( new Set(), ); const [anchorIndex, setAnchorIndex] = createSignal(-1); + const [bufferPopover, setBufferPopover] = createSignal<{ + word: FlatWord; + flatIndex: number; + position: { x: number; y: number }; + } | null>(null); let scrollContainerRef: HTMLDivElement | undefined; let activeWordRef: HTMLSpanElement | undefined; @@ -429,6 +853,14 @@ function TranscriptEditor(props: { const selectedCount = () => selectedIndices().size; + const pauseAfterWord = createMemo(() => { + const map = new Map(); + for (const p of props.pauses) { + map.set(`${p.afterSegmentIndex}:${p.afterWordIndex}`, p); + } + return map; + }); + createEffect( on( () => props.activeWordIndex, @@ -460,10 +892,19 @@ function TranscriptEditor(props: { if (e.key === "Backspace" || e.key === "Delete") { e.preventDefault(); const indices = [...selected]; - if (indices.length === 1) { - props.onDeleteWord(indices[0]); + const firstWord = props.allWords[indices[0]]; + if (firstWord?.deleted) { + if (indices.length === 1) { + props.onRestoreWord(indices[0]); + } else { + props.onRestoreWords(indices); + } } else { - props.onDeleteWords(indices); + if (indices.length === 1) { + props.onDeleteWord(indices[0]); + } else { + props.onDeleteWords(indices); + } } setSelectedIndices(new Set()); setAnchorIndex(-1); @@ -548,6 +989,47 @@ function TranscriptEditor(props: { setAnchorIndex(-1); }; + const handleWordRestore = (word: FlatWord) => { + const selected = selectedIndices(); + if (selected.size > 1) { + props.onRestoreWords([...selected]); + } else { + props.onRestoreWord(flatIndexOf(word)); + } + setSelectedIndices(new Set()); + setAnchorIndex(-1); + }; + + const handleContextMenu = (word: FlatWord, e: MouseEvent) => { + if (word.deleted) { + setBufferPopover({ + word, + flatIndex: flatIndexOf(word), + position: { x: e.clientX, y: e.clientY }, + }); + } + }; + + const handleBufferChange = ( + segmentIndex: number, + wordIndex: number, + bufferStart: number, + bufferEnd: number, + ) => { + setProject( + produce((p) => { + if (!p.captions?.segments) return; + const seg = p.captions.segments[segmentIndex]; + if (!seg?.words) return; + const w = seg.words[wordIndex] as CaptionWordExtended; + if (w) { + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + } + }), + ); + }; + return (
flatIndexOf(word); const isActive = () => props.activeWordIndex === flatIdx(); const isSelected = () => selectedIndices().has(flatIdx()); + const pause = () => + pauseAfterWord().get( + `${word.segmentIndex}:${word.wordIndex}`, + ); return ( - { - if (isActive()) activeWordRef = el; - }} - onClick={(e: MouseEvent) => handleWordSelect(word, e)} - onDelete={() => handleWordDelete(word)} - /> + <> + { + if (isActive()) activeWordRef = el; + }} + onClick={(e: MouseEvent) => handleWordSelect(word, e)} + onDelete={() => handleWordDelete(word)} + onRestore={() => handleWordRestore(word)} + onContextMenu={(e: MouseEvent) => + handleContextMenu(word, e) + } + /> + + {(p) => } + + ); }} @@ -598,6 +1093,20 @@ function TranscriptEditor(props: {
+ + {(popover) => ( + setBufferPopover(null)} + onBufferChange={handleBufferChange} + onRestore={() => { + handleWordRestore(popover().word); + setBufferPopover(null); + }} + /> + )} +
); } diff --git a/apps/desktop/src/routes/editor/caption-types.ts b/apps/desktop/src/routes/editor/caption-types.ts new file mode 100644 index 00000000000..4ef410e417b --- /dev/null +++ b/apps/desktop/src/routes/editor/caption-types.ts @@ -0,0 +1,9 @@ +import type { CaptionWord as BaseCaptionWord } from "~/utils/tauri"; + +export interface CaptionWordExtended extends BaseCaptionWord { + deleted?: boolean; + isFiller?: boolean; + isPause?: boolean; + bufferStart?: number; + bufferEnd?: number; +} diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index 1af027145ca..d6a58fa4835 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -11,6 +11,7 @@ import { type SegmentRecordings, type TimelineSegment, } from "~/utils/tauri"; +import type { CaptionWordExtended } from "./caption-types"; export const DEFAULT_CAPTION_MODEL = "best"; export const DEFAULT_WHISPER_CAPTION_MODEL = "small"; export const DEFAULT_CAPTION_LANGUAGE = "auto"; @@ -138,18 +139,20 @@ export function mapCaptionsToEditedTimeline( const mappedCaptionSegments = mappings.flatMap((mapping) => { if (caption.words && caption.words.length > 0) { const mappedWords = caption.words.flatMap((word) => { - const wordMapped = mapTimeRangeWithinMapping( - word.start, - word.end, - mapping, - ); + const w = word as CaptionWordExtended; + const wordMapped = mapTimeRangeWithinMapping(w.start, w.end, mapping); return wordMapped ? [ { - text: word.text, + text: w.text, start: wordMapped.start, end: wordMapped.end, + deleted: w.deleted ?? false, + isFiller: w.isFiller ?? false, + isPause: w.isPause ?? false, + bufferStart: w.bufferStart ?? 0, + bufferEnd: w.bufferEnd ?? 0, }, ] : []; @@ -209,19 +212,29 @@ export function mapCaptionsToEditedTimeline( export function createCaptionTrackSegments( segments: CaptionSegment[], ): CaptionTrackSegment[] { - return segments.map((segment) => ({ - id: segment.id, - start: segment.start, - end: segment.end, - text: segment.text, - words: segment.words ?? [], - fadeDurationOverride: null, - lingerDurationOverride: null, - positionOverride: null, - colorOverride: null, - backgroundColorOverride: null, - fontSizeOverride: null, - })); + return segments.map((segment) => { + const words = (segment.words ?? []) as CaptionWordExtended[]; + const visibleText = words.some((w) => w.deleted) + ? words + .filter((w) => !w.deleted) + .map((w) => w.text.trim()) + .filter((t) => t.length > 0) + .join(" ") + : segment.text; + return { + id: segment.id, + start: segment.start, + end: segment.end, + text: visibleText, + words, + fadeDurationOverride: null, + lingerDurationOverride: null, + positionOverride: null, + colorOverride: null, + backgroundColorOverride: null, + fontSizeOverride: null, + }; + }); } export function applyCaptionResultToProject< @@ -301,8 +314,9 @@ export async function transcribeEditorCaptions( return await commands.transcribeAudio(videoPath, modelPath, language, engine); } -export function getCaptionTextFromWords(words: CaptionWord[]) { +export function getCaptionTextFromWords(words: CaptionWordExtended[]) { return words + .filter((word) => !word.deleted) .map((word) => word.text.trim()) .filter((word) => word.length > 0) .join(" "); diff --git a/apps/desktop/src/routes/editor/filler-detection.ts b/apps/desktop/src/routes/editor/filler-detection.ts new file mode 100644 index 00000000000..81a469ea557 --- /dev/null +++ b/apps/desktop/src/routes/editor/filler-detection.ts @@ -0,0 +1,68 @@ +import type { CaptionWordExtended } from "./caption-types"; + +const FILLER_WORDS = new Set(["uh", "um", "ah", "er", "hmm", "mhm"]); + +export function isFillerWord(text: string): boolean { + const normalized = text.toLowerCase().replace(/[^a-z]/g, ""); + return FILLER_WORDS.has(normalized); +} + +export const PAUSE_DETECTION_THRESHOLD = 0.5; +export const AUTO_CLEAN_SILENCE_THRESHOLD = 1.5; +export const DEFAULT_PAUSE_BUFFER = 0.15; + +export function detectAndAnnotateWords( + words: CaptionWordExtended[], +): CaptionWordExtended[] { + return words.map((w) => ({ + ...w, + isFiller: isFillerWord(w.text), + })); +} + +export interface PauseEntry { + text: string; + start: number; + end: number; + duration: number; + deleted: boolean; + isPause: true; + isFiller: false; + bufferStart: number; + bufferEnd: number; + segmentIndex: number; + afterWordIndex: number; +} + +export function detectPauses( + words: Array<{ + start: number; + end: number; + segmentIndex: number; + wordIndex: number; + }>, + threshold = PAUSE_DETECTION_THRESHOLD, +): PauseEntry[] { + const pauses: PauseEntry[] = []; + for (let i = 1; i < words.length; i++) { + const prev = words[i - 1]; + const curr = words[i]; + const gap = curr.start - prev.end; + if (gap >= threshold) { + pauses.push({ + text: `[Pause ${gap.toFixed(1)}s]`, + start: prev.end, + end: curr.start, + duration: gap, + deleted: false, + isPause: true, + isFiller: false, + bufferStart: DEFAULT_PAUSE_BUFFER, + bufferEnd: DEFAULT_PAUSE_BUFFER, + segmentIndex: prev.segmentIndex, + afterWordIndex: prev.wordIndex, + }); + } + } + return pauses; +} diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index b1c1ecd8bd6..6374f3b0afc 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -151,3 +151,124 @@ export function rippleDeleteAllTracks( if (timeline.keyboardSegments) rippleDeleteFromTrack(timeline.keyboardSegments, cutStart, cutEnd); } + +export function shiftTimeAfterInsert( + time: number, + insertPoint: number, + duration: number, +): number { + if (time <= insertPoint) return time; + return time + duration; +} + +export function shiftCaptionTimesAfterInsert( + segments: Array<{ + start: number; + end: number; + words?: Array<{ start: number; end: number }>; + }>, + insertPoint: number, + duration: number, +) { + for (const seg of segments) { + if (seg.words) { + for (const w of seg.words) { + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration); + w.end = shiftTimeAfterInsert(w.end, insertPoint, duration); + } + if (seg.words.length > 0) { + seg.start = seg.words[0].start; + seg.end = seg.words[seg.words.length - 1].end; + } + } + } +} + +export function rippleInsertIntoTrack( + segments: Array<{ start: number; end: number }>, + insertPoint: number, + duration: number, +) { + for (const seg of segments) { + if (seg.start >= insertPoint) { + seg.start += duration; + seg.end += duration; + } else if (seg.end > insertPoint) { + seg.end += duration; + } + } +} + +export function insertClipSegmentForRange( + segments: Array<{ timescale: number; start: number; end: number }>, + insertPoint: number, + duration: number, +) { + let editedOffset = 0; + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; + const segDuration = (seg.end - seg.start) / seg.timescale; + const segEditedEnd = editedOffset + segDuration; + + if (insertPoint <= segEditedEnd) { + const relativeInSeg = (insertPoint - editedOffset) * seg.timescale; + const splitPoint = seg.start + relativeInSeg; + const insertAmount = duration * seg.timescale; + + if (splitPoint <= seg.start + 0.001) { + seg.start -= insertAmount; + } else if (splitPoint >= seg.end - 0.001) { + seg.end += insertAmount; + } else { + const originalEnd = seg.end; + seg.end = splitPoint; + const insertedSeg = { + timescale: seg.timescale, + start: splitPoint, + end: splitPoint + insertAmount, + }; + const afterSeg = { + timescale: seg.timescale, + start: splitPoint + insertAmount, + end: originalEnd + insertAmount, + }; + segments.splice(i + 1, 0, insertedSeg, afterSeg); + } + return; + } + editedOffset += segDuration; + } + + if (segments.length > 0) { + const lastSeg = segments[segments.length - 1]; + lastSeg.end += duration * lastSeg.timescale; + } +} + +export function rippleInsertAllTracks( + timeline: { + segments: Array<{ timescale: number; start: number; end: number }>; + zoomSegments?: Array<{ start: number; end: number }> | null; + sceneSegments?: Array<{ start: number; end: number }> | null; + maskSegments?: Array<{ start: number; end: number }> | null; + textSegments?: Array<{ start: number; end: number }> | null; + captionSegments?: Array<{ start: number; end: number }> | null; + keyboardSegments?: Array<{ start: number; end: number }> | null; + }, + insertPoint: number, + duration: number, +) { + insertClipSegmentForRange(timeline.segments, insertPoint, duration); + if (timeline.zoomSegments) + rippleInsertIntoTrack(timeline.zoomSegments, insertPoint, duration); + if (timeline.sceneSegments) + rippleInsertIntoTrack(timeline.sceneSegments, insertPoint, duration); + if (timeline.maskSegments) + rippleInsertIntoTrack(timeline.maskSegments, insertPoint, duration); + if (timeline.textSegments) + rippleInsertIntoTrack(timeline.textSegments, insertPoint, duration); + if (timeline.captionSegments) + rippleInsertIntoTrack(timeline.captionSegments, insertPoint, duration); + if (timeline.keyboardSegments) + rippleInsertIntoTrack(timeline.keyboardSegments, insertPoint, duration); +} diff --git a/apps/web/package.json b/apps/web/package.json index 0d21caf3bae..712fe6d2e95 100644 --- a/apps/web/package.json +++ b/apps/web/package.json @@ -6,7 +6,7 @@ "dev": "cross-env NODE_OPTIONS=--disable-warning=DEP0169 dotenv -e ../../.env -- next dev", "build": "next build --turbopack", "build:web": "next build --turbopack", - "build:docker": "cd ../.. && docker build -t cap-web-docker . --no-cache --progress=plain", + "build:docker": "cd ../.. && docker build -t cap-web-docker -f apps/web/Dockerfile . --no-cache --progress=plain", "start": "cross-env NODE_OPTIONS=--disable-warning=DEP0169 next start", "compress-images": "bash tools/compress-images.sh", "test": "vitest run", diff --git a/crates/project/src/configuration.rs b/crates/project/src/configuration.rs index 08615b143ae..b81539cbde0 100644 --- a/crates/project/src/configuration.rs +++ b/crates/project/src/configuration.rs @@ -902,6 +902,16 @@ pub struct CaptionWord { pub text: String, pub start: f32, pub end: f32, + #[serde(default)] + pub deleted: bool, + #[serde(default)] + pub is_filler: bool, + #[serde(default)] + pub is_pause: bool, + #[serde(default)] + pub buffer_start: f32, + #[serde(default)] + pub buffer_end: f32, } #[derive(Type, Serialize, Deserialize, Clone, Debug, Default)] diff --git a/crates/rendering/src/layers/captions.rs b/crates/rendering/src/layers/captions.rs index 15ec581e9d8..94055420f4b 100644 --- a/crates/rendering/src/layers/captions.rs +++ b/crates/rendering/src/layers/captions.rs @@ -396,12 +396,24 @@ impl CaptionsLayer { active.segment.end as f32, ); - let raw_caption_text = self.current_text.clone().unwrap_or_default(); + let raw_caption_text = if active.segment.words.iter().any(|w| w.deleted) { + active + .segment + .words + .iter() + .filter(|w| !w.deleted) + .map(|w| w.text.as_str()) + .collect::>() + .join(" ") + } else { + self.current_text.clone().unwrap_or_default() + }; let caption_text = wrap_text_by_words(&raw_caption_text, MAX_WORDS_PER_LINE); let caption_words: Vec = active .segment .words .iter() + .filter(|w| !w.deleted) .map(|w| CaptionWord { text: w.text.clone(), start: w.start, diff --git a/package.json b/package.json index 8ae22dd6d83..0825497bfbe 100644 --- a/package.json +++ b/package.json @@ -70,4 +70,4 @@ "engines": { "node": ">=20" } -} +} \ No newline at end of file From 751dfdff5f507243ef855a4573ce816951ff98be Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 07:12:11 +0530 Subject: [PATCH 02/39] fix: add auxiliary metadata fields to CaptionWord type and update struct initialization --- apps/desktop/src-tauri/src/captions.rs | 4 ++++ apps/desktop/src/utils/tauri.ts | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index 0b8d6fe06ae..68d69910d8d 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -822,6 +822,7 @@ fn process_with_whisper( text: current_word.trim().to_string(), start: ws, end: word_end, + ..Default::default() }); } current_word = token_text.trim().to_string(); @@ -854,6 +855,7 @@ fn process_with_whisper( text: current_word.trim().to_string(), start: ws, end: word_end, + ..Default::default() }); } @@ -1000,6 +1002,7 @@ fn process_with_parakeet( text: t.text.trim().to_string(), start: t.start, end: t.end, + ..Default::default() }) .collect(); @@ -1393,6 +1396,7 @@ pub fn parse_captions_json(json: &str) -> Result Date: Mon, 1 Jun 2026 08:40:59 +0530 Subject: [PATCH 03/39] fix: restore video with word restore --- apps/desktop/src/routes/editor/timeline-utils.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index 6374f3b0afc..fe673118ec9 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -19,8 +19,9 @@ export function shiftCaptionTimesAfterCut( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { + const duration = w.end - w.start; w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); - w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); + w.end = w.start + duration; } if (seg.words.length > 0) { seg.start = seg.words[0].start; @@ -168,13 +169,14 @@ export function shiftCaptionTimesAfterInsert( words?: Array<{ start: number; end: number }>; }>, insertPoint: number, - duration: number, + duration_arg: number, ) { for (const seg of segments) { if (seg.words) { for (const w of seg.words) { - w.start = shiftTimeAfterInsert(w.start, insertPoint, duration); - w.end = shiftTimeAfterInsert(w.end, insertPoint, duration); + const duration = w.end - w.start; + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); + w.end = w.start + duration; } if (seg.words.length > 0) { seg.start = seg.words[0].start; From eab4d9e49782d64d2a3f6e7386e0cd084efdb41f Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 09:14:21 +0530 Subject: [PATCH 04/39] feat(desktop): implement transcript editing buffers and ripple deletion --- .../src/routes/editor/Timeline/ClipTrack.tsx | 8 +- .../src/routes/editor/TranscriptPage.tsx | 203 ++++++++++++++---- apps/desktop/src/routes/editor/captions.ts | 6 +- .../src/routes/editor/timeline-utils.ts | 13 +- 4 files changed, 185 insertions(+), 45 deletions(-) diff --git a/apps/desktop/src/routes/editor/Timeline/ClipTrack.tsx b/apps/desktop/src/routes/editor/Timeline/ClipTrack.tsx index cdeef7f13e4..ccbe3f95b8d 100644 --- a/apps/desktop/src/routes/editor/Timeline/ClipTrack.tsx +++ b/apps/desktop/src/routes/editor/Timeline/ClipTrack.tsx @@ -143,7 +143,7 @@ function WaveformCanvas(props: { rafId = null; if (!canvas) return; const ctx = canvas.getContext("2d"); - if (!ctx) return; + if (!ctx || !props.segment) return; const segmentDuration = props.segment.end - props.segment.start; const fullSegmentWidth = width(); @@ -262,8 +262,8 @@ function WaveformCanvas(props: { timelineBounds.width; editorState.timeline.transform.position; editorState.timeline.transform.zoom; - props.segment.start; - props.segment.end; + props.segment?.start; + props.segment?.end; props.micWaveform; props.systemWaveform; project.audio.micVolumeDb; @@ -392,6 +392,8 @@ export function ClipTrack( {(segmentIndex) => { const i = segmentIndex; const segment = () => segments()[i()]; + if (!segment()) return null; + const [startHandleDrag, setStartHandleDrag] = createSignal { const words = allWords(); const result: PauseIndicator[] = []; - for (let i = 1; i < words.length; i++) { - const prev = words[i - 1]; + let lastVisible: (typeof words)[0] | null = null; + + for (let i = 0; i < words.length; i++) { const curr = words[i]; - if (prev.deleted || curr.deleted) continue; - const gap = curr.start - prev.end; - if (gap >= PAUSE_DETECTION_THRESHOLD) { - result.push({ - type: "pause", - start: prev.end, - end: curr.start, - duration: gap, - afterSegmentIndex: prev.segmentIndex, - afterWordIndex: prev.wordIndex, - }); + if (curr.deleted) continue; + + if (lastVisible) { + const gap = curr.start - lastVisible.end; + if (gap >= PAUSE_DETECTION_THRESHOLD) { + result.push({ + type: "pause", + start: lastVisible.end, + end: curr.start, + duration: gap, + afterSegmentIndex: lastVisible.segmentIndex, + afterWordIndex: lastVisible.wordIndex, + }); + } } + lastVisible = curr; } return result; }); @@ -386,33 +393,108 @@ export function TranscriptPanel() { const ps = pauses(); const threshold = silenceThreshold(); - const fillerIndices: number[] = []; - for (let i = 0; i < words.length; i++) { - if (!words[i].deleted && words[i].isFiller) { - fillerIndices.push(i); - } - } + const fillerWords = words.filter((w) => !w.deleted && w.isFiller); + const pausesToClean = ps.filter((p) => p.duration >= threshold); - const pauseWordIndices: number[] = []; - for (const p of ps) { - if (p.duration >= threshold) { - for (let i = 0; i < words.length; i++) { - const w = words[i]; - if ( - !w.deleted && - w.start >= p.start - 0.01 && - w.end <= p.end + 0.01 - ) { - pauseWordIndices.push(i); + if (fillerWords.length === 0 && pausesToClean.length === 0) return; + + setProject( + produce((p) => { + if (!p.captions?.segments) return; + + const timeRanges: Array<{ start: number; end: number }> = []; + + // 1. Mark fillers as deleted and collect their ranges + for (const fw of fillerWords) { + const seg = p.captions.segments[fw.segmentIndex]; + if (seg?.words) { + const w = seg.words[fw.wordIndex] as CaptionWordExtended; + if (w) w.deleted = true; } + timeRanges.push({ + start: Math.max(0, fw.start - (fw.bufferStart || 0)), + end: fw.end + (fw.bufferEnd || 0), + }); } - } - } - const allIndices = [...new Set([...fillerIndices, ...pauseWordIndices])]; - if (allIndices.length > 0) { - applyWordDeletions(allIndices); - } + // 2. Insert pause words and collect their ranges + const sortedPauses = [...pausesToClean].sort((a, b) => { + if (a.afterSegmentIndex !== b.afterSegmentIndex) { + return b.afterSegmentIndex - a.afterSegmentIndex; + } + return b.afterWordIndex - a.afterWordIndex; + }); + + for (const pInfo of sortedPauses) { + const seg = p.captions.segments[pInfo.afterSegmentIndex]; + if (seg?.words) { + const pauseWord: CaptionWordExtended = { + text: `[Pause ${pInfo.duration.toFixed(1)}s]`, + start: pInfo.start, + end: pInfo.end, + deleted: true, + isPause: true, + isFiller: false, + bufferStart: DEFAULT_PAUSE_BUFFER, + bufferEnd: DEFAULT_PAUSE_BUFFER, + }; + seg.words.splice(pInfo.afterWordIndex + 1, 0, pauseWord); + } + timeRanges.push({ + start: Math.max(0, pInfo.start - DEFAULT_PAUSE_BUFFER), + end: pInfo.end + DEFAULT_PAUSE_BUFFER, + }); + } + + // 3. Recalculate seg.start and seg.end + for (const seg of p.captions.segments) { + const extWords = (seg.words ?? []) as CaptionWordExtended[]; + seg.text = getCaptionTextFromWords(extWords); + if (seg.words && seg.words.length > 0) { + const visible = extWords.filter((w) => !w.deleted); + if (visible.length > 0) { + seg.start = visible[0].start; + seg.end = visible[visible.length - 1].end; + } + } + } + + // 4. Merge time ranges and apply cuts + timeRanges.sort((a, b) => a.start - b.start); + const mergedRanges: { start: number; end: number }[] = []; + for (const range of timeRanges) { + const last = mergedRanges[mergedRanges.length - 1]; + if (last && range.start <= last.end) { + last.end = Math.max(last.end, range.end); + } else { + mergedRanges.push({ ...range }); + } + } + + const reversedRanges = [...mergedRanges].reverse(); + for (const range of reversedRanges) { + const cutDuration = range.end - range.start; + if (cutDuration <= 0.001) continue; + + shiftCaptionTimesAfterCut( + p.captions.segments, + range.start, + cutDuration, + ); + + if (p.timeline) { + rippleDeleteAllTracks(p.timeline, range.start, range.end); + } + } + + if (p.timeline && p.captions) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + }), + ); + setEditorState("captions", "isStale", false); }; const isAtEnd = () => { @@ -826,7 +908,7 @@ function TranscriptEditor(props: { onRestoreWord: (flatIndex: number) => void; onRestoreWords: (flatIndices: number[]) => void; }) { - const { setProject } = useEditorContext(); + const { setProject, setEditorState } = useEditorContext(); const [selectedIndices, setSelectedIndices] = createSignal>( new Set(), ); @@ -1022,12 +1104,57 @@ function TranscriptEditor(props: { const seg = p.captions.segments[segmentIndex]; if (!seg?.words) return; const w = seg.words[wordIndex] as CaptionWordExtended; - if (w) { + if (!w) return; + + if (w.deleted) { + // 1. Undo the old cut + const oldCutStart = Math.max(0, w.start - (w.bufferStart || 0)); + const oldCutEnd = w.end + (w.bufferEnd || 0); + const oldDuration = oldCutEnd - oldCutStart; + + if (oldDuration > 0.001) { + shiftCaptionTimesAfterInsert( + p.captions.segments, + oldCutStart, + oldDuration, + ); + if (p.timeline) { + rippleInsertAllTracks(p.timeline, oldCutStart, oldDuration); + } + } + + // 2. Update buffers + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + + // 3. Apply the new cut + const newCutStart = Math.max(0, w.start - w.bufferStart); + const newCutEnd = w.end + w.bufferEnd; + const newDuration = newCutEnd - newCutStart; + + if (newDuration > 0.001) { + shiftCaptionTimesAfterCut( + p.captions.segments, + newCutStart, + newDuration, + ); + if (p.timeline) { + rippleDeleteAllTracks(p.timeline, newCutStart, newCutEnd); + } + } + + if (p.timeline) { + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + } else { w.bufferStart = bufferStart; w.bufferEnd = bufferEnd; } }), ); + setEditorState("captions", "isStale", false); }; return ( diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index d6a58fa4835..e59c34ecb29 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -214,9 +214,9 @@ export function createCaptionTrackSegments( ): CaptionTrackSegment[] { return segments.map((segment) => { const words = (segment.words ?? []) as CaptionWordExtended[]; - const visibleText = words.some((w) => w.deleted) + const visibleText = words.some((w) => w.deleted || w.isPause) ? words - .filter((w) => !w.deleted) + .filter((w) => !w.deleted && !w.isPause) .map((w) => w.text.trim()) .filter((t) => t.length > 0) .join(" ") @@ -316,7 +316,7 @@ export async function transcribeEditorCaptions( export function getCaptionTextFromWords(words: CaptionWordExtended[]) { return words - .filter((word) => !word.deleted) + .filter((word) => !word.deleted && !word.isPause) .map((word) => word.text.trim()) .filter((word) => word.length > 0) .join(" "); diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index fe673118ec9..61e5d5fbc41 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -19,6 +19,9 @@ export function shiftCaptionTimesAfterCut( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { + if (w.start >= cutStart && w.end <= cutStart + cutDuration) { + continue; + } const duration = w.end - w.start; w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); w.end = w.start + duration; @@ -91,7 +94,12 @@ export function cutClipSegmentsForRange( editedOffset += duration; } - if (startSegIdx === -1 || endSegIdx === -1) return; + if (startSegIdx === -1) return; + + if (endSegIdx === -1) { + endSegIdx = segments.length - 1; + endRelative = segments[endSegIdx].end - segments[endSegIdx].start; + } if (startSegIdx === endSegIdx) { const seg = segments[startSegIdx]; @@ -174,6 +182,9 @@ export function shiftCaptionTimesAfterInsert( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { + if (w.start >= insertPoint && w.end <= insertPoint + duration_arg) { + continue; + } const duration = w.end - w.start; w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); w.end = w.start + duration; From 15f43402441039767631031d88b7cc4e384340e1 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 10:04:53 +0530 Subject: [PATCH 05/39] feat(desktop): support deleting pauses and cap word duration during transcription --- apps/desktop/src-tauri/src/captions.rs | 25 +++-- .../src/routes/editor/TranscriptPage.tsx | 99 ++++++++++++++++++- .../src/routes/editor/timeline-utils.ts | 13 +-- package.json | 2 +- 4 files changed, 117 insertions(+), 22 deletions(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index 68d69910d8d..f30352f9c65 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -810,16 +810,23 @@ fn process_with_whisper( if token_text.starts_with(' ') || token_text.starts_with('\n') { if !current_word.is_empty() - && let Some(ws) = word_start + && let Some(mut ws) = word_start { + let text = current_word.trim().to_string(); + let duration = word_end - ws; + let max_duration = (text.len() as f32 * 0.1).clamp(0.5, 1.5); + if duration > max_duration + 0.3 { + ws = word_end - max_duration; + } + log::info!( " -> Completing word: '{}' ({:.2}s - {:.2}s)", - current_word.trim(), + text, ws, word_end ); words.push(CaptionWord { - text: current_word.trim().to_string(), + text, start: ws, end: word_end, ..Default::default() @@ -843,16 +850,22 @@ fn process_with_whisper( } if !current_word.trim().is_empty() - && let Some(ws) = word_start + && let Some(mut ws) = word_start { + let text = current_word.trim().to_string(); + let duration = word_end - ws; + let max_duration = (text.len() as f32 * 0.1).clamp(0.5, 1.5); + if duration > max_duration + 0.3 { + ws = word_end - max_duration; + } log::info!( " -> Final word: '{}' ({:.2}s - {:.2}s)", - current_word.trim(), + text, ws, word_end ); words.push(CaptionWord { - text: current_word.trim().to_string(), + text, start: ws, end: word_end, ..Default::default() diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 2df720127ae..c142506b054 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -96,9 +96,19 @@ export function TranscriptPanel() { const words = (seg.words ?? []) as CaptionWordExtended[]; for (let wordIdx = 0; wordIdx < words.length; wordIdx++) { const w = words[wordIdx]; + + let start = w.start; + if (!w.isPause) { + const duration = w.end - w.start; + const maxDuration = Math.max(0.5, Math.min(1.5, w.text.length * 0.1)); + if (duration > maxDuration + 0.3) { + start = w.end - maxDuration; + } + } + result.push({ text: w.text, - start: w.start, + start, end: w.end, segmentIndex: segIdx, wordIndex: wordIdx, @@ -122,7 +132,19 @@ export function TranscriptPanel() { const curr = words[i]; if (curr.deleted) continue; - if (lastVisible) { + if (!lastVisible) { + const gap = curr.start; + if (gap >= PAUSE_DETECTION_THRESHOLD) { + result.push({ + type: "pause", + start: 0, + end: curr.start, + duration: gap, + afterSegmentIndex: curr.segmentIndex, + afterWordIndex: -1, + }); + } + } else { const gap = curr.start - lastVisible.end; if (gap >= PAUSE_DETECTION_THRESHOLD) { result.push({ @@ -888,10 +910,21 @@ function WordWithTooltip(props: { ); } -function PauseBadge(props: { pause: PauseIndicator }) { +function PauseBadge(props: { pause: PauseIndicator; onDelete: () => void }) { return ( - + ⏸ {props.pause.duration.toFixed(1)}s + ); } @@ -1157,6 +1190,44 @@ function TranscriptEditor(props: { setEditorState("captions", "isStale", false); }; + const handleDeletePause = (pause: PauseIndicator) => { + setProject( + produce((p) => { + if (!p.captions?.segments) return; + const seg = p.captions.segments[pause.afterSegmentIndex]; + if (seg?.words) { + const pauseWord: CaptionWordExtended = { + text: `[Pause ${pause.duration.toFixed(1)}s]`, + start: pause.start, + end: pause.end, + deleted: true, + isPause: true, + isFiller: false, + bufferStart: DEFAULT_PAUSE_BUFFER, + bufferEnd: DEFAULT_PAUSE_BUFFER, + }; + seg.words.splice(pause.afterWordIndex + 1, 0, pauseWord); + } + + const cutStart = Math.max(0, pause.start - DEFAULT_PAUSE_BUFFER); + const cutEnd = pause.end + DEFAULT_PAUSE_BUFFER; + const cutDuration = cutEnd - cutStart; + + if (cutDuration > 0.001) { + shiftCaptionTimesAfterCut(p.captions.segments, cutStart, cutDuration); + } + + if (p.timeline) { + rippleDeleteAllTracks(p.timeline, cutStart, cutEnd); + p.timeline.captionSegments = createCaptionTrackSegments( + p.captions.segments, + ); + } + }), + ); + setEditorState("captions", "isStale", false); + }; + return (
flatIndexOf(word); const isActive = () => props.activeWordIndex === flatIdx(); const isSelected = () => selectedIndices().has(flatIdx()); + const pauseBefore = () => + word.wordIndex === 0 + ? pauseAfterWord().get(`${word.segmentIndex}:-1`) + : undefined; + const pause = () => pauseAfterWord().get( `${word.segmentIndex}:${word.wordIndex}`, @@ -1194,6 +1270,14 @@ function TranscriptEditor(props: { return ( <> + + {(p) => ( + handleDeletePause(p())} + /> + )} + - {(p) => } + {(p) => ( + handleDeletePause(p())} + /> + )} ); diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index 61e5d5fbc41..22c1427e65b 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -4,6 +4,7 @@ export function shiftTimeAfterCut( cutDuration: number, ): number { if (time <= cutStart) return time; + if (time <= cutStart + cutDuration) return cutStart; return time - cutDuration; } @@ -19,12 +20,8 @@ export function shiftCaptionTimesAfterCut( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { - if (w.start >= cutStart && w.end <= cutStart + cutDuration) { - continue; - } - const duration = w.end - w.start; w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); - w.end = w.start + duration; + w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); } if (seg.words.length > 0) { seg.start = seg.words[0].start; @@ -182,12 +179,8 @@ export function shiftCaptionTimesAfterInsert( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { - if (w.start >= insertPoint && w.end <= insertPoint + duration_arg) { - continue; - } - const duration = w.end - w.start; w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); - w.end = w.start + duration; + w.end = shiftTimeAfterInsert(w.end, insertPoint, duration_arg); } if (seg.words.length > 0) { seg.start = seg.words[0].start; diff --git a/package.json b/package.json index 0825497bfbe..8ae22dd6d83 100644 --- a/package.json +++ b/package.json @@ -70,4 +70,4 @@ "engines": { "node": ">=20" } -} \ No newline at end of file +} From b194ea5d7501e0395759fbb77439dcb10094374c Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 10:28:40 +0530 Subject: [PATCH 06/39] improve(desktop): refine word duration capping, buffer popover interactions, and deleted word shifting --- .../src/routes/editor/TranscriptPage.tsx | 16 ++++++++-- .../src/routes/editor/timeline-utils.ts | 30 +++++++++++++++---- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index c142506b054..24c2e914ce5 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -97,19 +97,23 @@ export function TranscriptPanel() { for (let wordIdx = 0; wordIdx < words.length; wordIdx++) { const w = words[wordIdx]; - let start = w.start; + const start = w.start; + let end = w.end; if (!w.isPause) { const duration = w.end - w.start; const maxDuration = Math.max(0.5, Math.min(1.5, w.text.length * 0.1)); if (duration > maxDuration + 0.3) { - start = w.end - maxDuration; + // Parakeet TDT attaches trailing silence to the END of the word. + // We must cap w.end so the spoken word is preserved at the beginning of the timestamp block, + // exposing the silence AFTER the word. + end = w.start + maxDuration; } } result.push({ text: w.text, start, - end: w.end, + end, segmentIndex: segIdx, wordIndex: wordIdx, deleted: w.deleted ?? false, @@ -771,6 +775,9 @@ function BufferPopover(props: { step="0.01" value={bufStart()} onInput={(e) => + setBufStart(Number.parseFloat(e.currentTarget.value)) + } + onChange={(e) => updateBuffer(Number.parseFloat(e.currentTarget.value), bufEnd()) } class="w-full h-1 accent-blue-9" @@ -790,6 +797,9 @@ function BufferPopover(props: { step="0.01" value={bufEnd()} onInput={(e) => + setBufEnd(Number.parseFloat(e.currentTarget.value)) + } + onChange={(e) => updateBuffer( bufStart(), Number.parseFloat(e.currentTarget.value), diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index 22c1427e65b..980a6c186fd 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -12,7 +12,7 @@ export function shiftCaptionTimesAfterCut( segments: Array<{ start: number; end: number; - words?: Array<{ start: number; end: number }>; + words?: Array<{ start: number; end: number; deleted?: boolean }>; }>, cutStart: number, cutDuration: number, @@ -20,8 +20,17 @@ export function shiftCaptionTimesAfterCut( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { - w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); - w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); + if (w.deleted) { + if (w.start >= cutStart && w.end <= cutStart + cutDuration) { + continue; + } + const duration = w.end - w.start; + w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); + w.end = w.start + duration; + } else { + w.start = shiftTimeAfterCut(w.start, cutStart, cutDuration); + w.end = shiftTimeAfterCut(w.end, cutStart, cutDuration); + } } if (seg.words.length > 0) { seg.start = seg.words[0].start; @@ -171,7 +180,7 @@ export function shiftCaptionTimesAfterInsert( segments: Array<{ start: number; end: number; - words?: Array<{ start: number; end: number }>; + words?: Array<{ start: number; end: number; deleted?: boolean }>; }>, insertPoint: number, duration_arg: number, @@ -179,8 +188,17 @@ export function shiftCaptionTimesAfterInsert( for (const seg of segments) { if (seg.words) { for (const w of seg.words) { - w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); - w.end = shiftTimeAfterInsert(w.end, insertPoint, duration_arg); + if (w.deleted) { + if (w.start >= insertPoint && w.end <= insertPoint + duration_arg) { + continue; + } + const duration = w.end - w.start; + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); + w.end = w.start + duration; + } else { + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); + w.end = shiftTimeAfterInsert(w.end, insertPoint, duration_arg); + } } if (seg.words.length > 0) { seg.start = seg.words[0].start; From cc6857ca4abc07cb3eecd6a9c268ac85ac0a4655 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 10:37:20 +0530 Subject: [PATCH 07/39] fix(desktop): prevent timeline word stretching during ripple insert This fixes a boundary math flaw where restoring a word could cause adjacent words exactly at the insert point to stretch their durations instead of simply shifting right, gradually desyncing the timeline. --- apps/desktop/src/routes/editor/timeline-utils.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index 980a6c186fd..326ccf3f4c4 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -172,7 +172,7 @@ export function shiftTimeAfterInsert( insertPoint: number, duration: number, ): number { - if (time <= insertPoint) return time; + if (time < insertPoint - 0.001) return time; return time + duration; } @@ -189,16 +189,16 @@ export function shiftCaptionTimesAfterInsert( if (seg.words) { for (const w of seg.words) { if (w.deleted) { - if (w.start >= insertPoint && w.end <= insertPoint + duration_arg) { + if ( + w.start >= insertPoint - 0.001 && + w.end <= insertPoint + duration_arg + 0.001 + ) { continue; } - const duration = w.end - w.start; - w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); - w.end = w.start + duration; - } else { - w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); - w.end = shiftTimeAfterInsert(w.end, insertPoint, duration_arg); } + const duration = w.end - w.start; + w.start = shiftTimeAfterInsert(w.start, insertPoint, duration_arg); + w.end = w.start + duration; } if (seg.words.length > 0) { seg.start = seg.words[0].start; From 510cebea5ee1c2342473247e8865f1a253aae096 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 10:53:29 +0530 Subject: [PATCH 08/39] fix(desktop): map isFiller with fallback to isFillerWord --- apps/desktop/src/routes/editor/captions.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index e59c34ecb29..4f390ba28d9 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -12,6 +12,7 @@ import { type TimelineSegment, } from "~/utils/tauri"; import type { CaptionWordExtended } from "./caption-types"; +import { isFillerWord } from "./filler-detection"; export const DEFAULT_CAPTION_MODEL = "best"; export const DEFAULT_WHISPER_CAPTION_MODEL = "small"; export const DEFAULT_CAPTION_LANGUAGE = "auto"; @@ -149,7 +150,7 @@ export function mapCaptionsToEditedTimeline( start: wordMapped.start, end: wordMapped.end, deleted: w.deleted ?? false, - isFiller: w.isFiller ?? false, + isFiller: w.isFiller ?? isFillerWord(w.text), isPause: w.isPause ?? false, bufferStart: w.bufferStart ?? 0, bufferEnd: w.bufferEnd ?? 0, From ad0f51a64f02ee69833705f2c11fc8ffa6ec2bf9 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:10:04 +0530 Subject: [PATCH 09/39] fix(desktop): force array update in autoClean to guarantee reactivity for word deletion --- .../src/routes/editor/TranscriptPage.tsx | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 24c2e914ce5..a0537c44ad1 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -263,10 +263,11 @@ export function TranscriptPanel() { for (const word of wordsToDelete) { const seg = p.captions.segments[word.segmentIndex]; - if (!seg?.words) continue; - const w = seg.words[word.wordIndex] as CaptionWordExtended; - if (w) { - w.deleted = true; + if (seg?.words) { + const w = seg.words[word.wordIndex] as CaptionWordExtended; + if (w) { + seg.words[word.wordIndex] = { ...w, deleted: true }; + } } } @@ -291,6 +292,10 @@ export function TranscriptPanel() { p.captions.segments, range.start, cutDuration, + wordsToDelete.map((w) => ({ + segmentIndex: w.segmentIndex, + wordIndex: w.wordIndex, + })), ); if (p.timeline) { @@ -354,6 +359,10 @@ export function TranscriptPanel() { p.captions.segments, range.start, insertDuration, + wordsToRestore.map((w) => ({ + segmentIndex: w.segmentIndex, + wordIndex: w.wordIndex, + })), ); if (p.timeline) { @@ -366,15 +375,25 @@ export function TranscriptPanel() { if (!seg?.words) continue; const w = seg.words[word.wordIndex] as CaptionWordExtended; if (w) { - w.deleted = false; + if (w.isPause) { + w._markForRemoval = true; + } else { + w.deleted = false; + } } } for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; - seg.text = getCaptionTextFromWords(extWords); - if (seg.words && seg.words.length > 0) { - const visible = extWords.filter((w) => !w.deleted); + + const filteredWords = extWords.filter((w) => !w._markForRemoval); + if (filteredWords.length !== extWords.length) { + seg.words = filteredWords; + } + + seg.text = getCaptionTextFromWords(filteredWords); + if (filteredWords && filteredWords.length > 0) { + const visible = filteredWords.filter((w) => !w.deleted); if (visible.length > 0) { seg.start = visible[0].start; seg.end = visible[visible.length - 1].end; @@ -435,7 +454,9 @@ export function TranscriptPanel() { const seg = p.captions.segments[fw.segmentIndex]; if (seg?.words) { const w = seg.words[fw.wordIndex] as CaptionWordExtended; - if (w) w.deleted = true; + if (w) { + seg.words[fw.wordIndex] = { ...w, deleted: true }; + } } timeRanges.push({ start: Math.max(0, fw.start - (fw.bufferStart || 0)), @@ -506,6 +527,10 @@ export function TranscriptPanel() { p.captions.segments, range.start, cutDuration, + fillerWords.map((w) => ({ + segmentIndex: w.segmentIndex, + wordIndex: w.wordIndex, + })), ); if (p.timeline) { @@ -1160,6 +1185,7 @@ function TranscriptEditor(props: { p.captions.segments, oldCutStart, oldDuration, + [{ segmentIndex, wordIndex }], ); if (p.timeline) { rippleInsertAllTracks(p.timeline, oldCutStart, oldDuration); @@ -1180,6 +1206,7 @@ function TranscriptEditor(props: { p.captions.segments, newCutStart, newDuration, + [{ segmentIndex, wordIndex }], ); if (p.timeline) { rippleDeleteAllTracks(p.timeline, newCutStart, newCutEnd); @@ -1224,7 +1251,17 @@ function TranscriptEditor(props: { const cutDuration = cutEnd - cutStart; if (cutDuration > 0.001) { - shiftCaptionTimesAfterCut(p.captions.segments, cutStart, cutDuration); + shiftCaptionTimesAfterCut( + p.captions.segments, + cutStart, + cutDuration, + [ + { + segmentIndex: pause.afterSegmentIndex, + wordIndex: pause.afterWordIndex + 1, + }, + ], + ); } if (p.timeline) { From c412566d1c68ea2697a0173bc5cade3ab25b1e79 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:13:17 +0530 Subject: [PATCH 10/39] fix(desktop): correctly fallback to isFillerWord evaluation for old projects --- apps/desktop/src/routes/editor/TranscriptPage.tsx | 2 +- apps/desktop/src/routes/editor/captions.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index a0537c44ad1..e84e509e007 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -117,7 +117,7 @@ export function TranscriptPanel() { segmentIndex: segIdx, wordIndex: wordIdx, deleted: w.deleted ?? false, - isFiller: w.isFiller ?? isFillerWord(w.text), + isFiller: w.isFiller || isFillerWord(w.text), isPause: w.isPause ?? false, bufferStart: w.bufferStart ?? 0, bufferEnd: w.bufferEnd ?? 0, diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index 4f390ba28d9..4c9af389641 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -150,7 +150,7 @@ export function mapCaptionsToEditedTimeline( start: wordMapped.start, end: wordMapped.end, deleted: w.deleted ?? false, - isFiller: w.isFiller ?? isFillerWord(w.text), + isFiller: w.isFiller || isFillerWord(w.text), isPause: w.isPause ?? false, bufferStart: w.bufferStart ?? 0, bufferEnd: w.bufferEnd ?? 0, From 91f6523c34695a9b002bc5093884ad03a2f5ba67 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:27:38 +0530 Subject: [PATCH 11/39] fix: resolve undo/redo break when adjusting buffer for deleted words - Passing explicit ignoreWords lists to shiftCaptionTimesAfterCut and shiftCaptionTimesAfterInsert - Prevents deleted words from being shifted by their own cut durations if buffer values are negative - Ensures proper tracking of _markForRemoval for restoring words --- .../src/routes/editor/TranscriptPage.tsx | 7 +--- .../src/routes/editor/caption-types.ts | 1 + .../src/routes/editor/timeline-utils.ts | 37 ++++++++++++++++--- 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index e84e509e007..8597686421f 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -439,7 +439,8 @@ export function TranscriptPanel() { const threshold = silenceThreshold(); const fillerWords = words.filter((w) => !w.deleted && w.isFiller); - const pausesToClean = ps.filter((p) => p.duration >= threshold); + const wordsWithoutFillers = words.filter((w) => !w.deleted && !w.isFiller); + const pausesToClean = detectPauses(wordsWithoutFillers, threshold); if (fillerWords.length === 0 && pausesToClean.length === 0) return; @@ -527,10 +528,6 @@ export function TranscriptPanel() { p.captions.segments, range.start, cutDuration, - fillerWords.map((w) => ({ - segmentIndex: w.segmentIndex, - wordIndex: w.wordIndex, - })), ); if (p.timeline) { diff --git a/apps/desktop/src/routes/editor/caption-types.ts b/apps/desktop/src/routes/editor/caption-types.ts index 4ef410e417b..2500709aa6e 100644 --- a/apps/desktop/src/routes/editor/caption-types.ts +++ b/apps/desktop/src/routes/editor/caption-types.ts @@ -6,4 +6,5 @@ export interface CaptionWordExtended extends BaseCaptionWord { isPause?: boolean; bufferStart?: number; bufferEnd?: number; + _markForRemoval?: boolean; } diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index 326ccf3f4c4..e9936aeaecd 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -16,12 +16,27 @@ export function shiftCaptionTimesAfterCut( }>, cutStart: number, cutDuration: number, + ignoreWords?: { segmentIndex: number; wordIndex: number }[], ) { - for (const seg of segments) { + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; if (seg.words) { - for (const w of seg.words) { + for (let j = 0; j < seg.words.length; j++) { + const w = seg.words[j]; + + if ( + ignoreWords?.some( + (ignore) => ignore.segmentIndex === i && ignore.wordIndex === j, + ) + ) { + continue; + } + if (w.deleted) { - if (w.start >= cutStart && w.end <= cutStart + cutDuration) { + if ( + w.start >= cutStart - 0.001 && + w.end <= cutStart + cutDuration + 0.001 + ) { continue; } const duration = w.end - w.start; @@ -184,10 +199,22 @@ export function shiftCaptionTimesAfterInsert( }>, insertPoint: number, duration_arg: number, + ignoreWords?: { segmentIndex: number; wordIndex: number }[], ) { - for (const seg of segments) { + for (let i = 0; i < segments.length; i++) { + const seg = segments[i]; if (seg.words) { - for (const w of seg.words) { + for (let j = 0; j < seg.words.length; j++) { + const w = seg.words[j]; + + if ( + ignoreWords?.some( + (ignore) => ignore.segmentIndex === i && ignore.wordIndex === j, + ) + ) { + continue; + } + if (w.deleted) { if ( w.start >= insertPoint - 0.001 && From 11676f7361f3e43232e01f0d0189835e75e1492f Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:28:15 +0530 Subject: [PATCH 12/39] fix(desktop): detect pauses after removing filler words to accurately clean newly formed gaps --- apps/desktop/src/routes/editor/TranscriptPage.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 8597686421f..06a6076d5f5 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -435,7 +435,6 @@ export function TranscriptPanel() { const autoClean = () => { const words = allWords(); - const ps = pauses(); const threshold = silenceThreshold(); const fillerWords = words.filter((w) => !w.deleted && w.isFiller); From fd7ba58411f042262aff1df2d75c9ef9aa2be969 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:31:00 +0530 Subject: [PATCH 13/39] fix(desktop): import detectPauses in TranscriptPage.tsx --- apps/desktop/src/routes/editor/TranscriptPage.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 06a6076d5f5..36b6630b869 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -20,6 +20,7 @@ import { import { FPS, useEditorContext } from "./context"; import { AUTO_CLEAN_SILENCE_THRESHOLD, + detectPauses, isFillerWord, PAUSE_DETECTION_THRESHOLD, } from "./filler-detection"; From 0fc0ea0b2a68f48b6c2e12b3c76f15e84847eec9 Mon Sep 17 00:00:00 2001 From: Maulik Date: Mon, 1 Jun 2026 11:58:41 +0530 Subject: [PATCH 14/39] feat(desktop): implement auto-clean undo and refine transcript editing timeline adjustments --- apps/desktop/src-tauri/src/captions.rs | 8 +- .../src/routes/editor/TranscriptPage.tsx | 82 +++++++++++++------ .../src/routes/editor/caption-types.ts | 1 - .../src/routes/editor/filler-detection.ts | 13 +-- .../src/routes/editor/timeline-utils.ts | 2 +- apps/web/package.json | 2 +- 6 files changed, 63 insertions(+), 45 deletions(-) diff --git a/apps/desktop/src-tauri/src/captions.rs b/apps/desktop/src-tauri/src/captions.rs index f30352f9c65..1109ef00895 100644 --- a/apps/desktop/src-tauri/src/captions.rs +++ b/apps/desktop/src-tauri/src/captions.rs @@ -810,13 +810,13 @@ fn process_with_whisper( if token_text.starts_with(' ') || token_text.starts_with('\n') { if !current_word.is_empty() - && let Some(mut ws) = word_start + && let Some(ws) = word_start { let text = current_word.trim().to_string(); let duration = word_end - ws; let max_duration = (text.len() as f32 * 0.1).clamp(0.5, 1.5); if duration > max_duration + 0.3 { - ws = word_end - max_duration; + word_end = ws + max_duration; } log::info!( @@ -850,13 +850,13 @@ fn process_with_whisper( } if !current_word.trim().is_empty() - && let Some(mut ws) = word_start + && let Some(ws) = word_start { let text = current_word.trim().to_string(); let duration = word_end - ws; let max_duration = (text.len() as f32 * 0.1).clamp(0.5, 1.5); if duration > max_duration + 0.3 { - ws = word_end - max_duration; + word_end = ws + max_duration; } log::info!( " -> Final word: '{}' ({:.2}s - {:.2}s)", diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 36b6630b869..25889814e62 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -20,6 +20,7 @@ import { import { FPS, useEditorContext } from "./context"; import { AUTO_CLEAN_SILENCE_THRESHOLD, + DEFAULT_PAUSE_BUFFER, detectPauses, isFillerWord, PAUSE_DETECTION_THRESHOLD, @@ -73,8 +74,6 @@ const TEXT_SIZES = [ { label: "XL", value: "text-lg leading-snug" }, ] as const; -const DEFAULT_PAUSE_BUFFER = 0.0; - export function TranscriptPanel() { const { editorState, @@ -377,7 +376,7 @@ export function TranscriptPanel() { const w = seg.words[word.wordIndex] as CaptionWordExtended; if (w) { if (w.isPause) { - w._markForRemoval = true; + seg.words.splice(word.wordIndex, 1); } else { w.deleted = false; } @@ -386,15 +385,9 @@ export function TranscriptPanel() { for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; - - const filteredWords = extWords.filter((w) => !w._markForRemoval); - if (filteredWords.length !== extWords.length) { - seg.words = filteredWords; - } - - seg.text = getCaptionTextFromWords(filteredWords); - if (filteredWords && filteredWords.length > 0) { - const visible = filteredWords.filter((w) => !w.deleted); + seg.text = getCaptionTextFromWords(extWords); + if (extWords.length > 0) { + const visible = extWords.filter((w) => !w.deleted); if (visible.length > 0) { seg.start = visible[0].start; seg.end = visible[visible.length - 1].end; @@ -434,6 +427,10 @@ export function TranscriptPanel() { { name: "editorAutoCleanThreshold" }, ); + const [lastAutoCleanIndices, setLastAutoCleanIndices] = createSignal< + number[] | null + >(null); + const autoClean = () => { const words = allWords(); const threshold = silenceThreshold(); @@ -444,13 +441,16 @@ export function TranscriptPanel() { if (fillerWords.length === 0 && pausesToClean.length === 0) return; + const affectedFlatIndices = fillerWords.map((fw) => words.indexOf(fw)); + setProject( produce((p) => { if (!p.captions?.segments) return; const timeRanges: Array<{ start: number; end: number }> = []; + const allIgnoreWords: { segmentIndex: number; wordIndex: number }[] = + []; - // 1. Mark fillers as deleted and collect their ranges for (const fw of fillerWords) { const seg = p.captions.segments[fw.segmentIndex]; if (seg?.words) { @@ -459,13 +459,16 @@ export function TranscriptPanel() { seg.words[fw.wordIndex] = { ...w, deleted: true }; } } + allIgnoreWords.push({ + segmentIndex: fw.segmentIndex, + wordIndex: fw.wordIndex, + }); timeRanges.push({ start: Math.max(0, fw.start - (fw.bufferStart || 0)), end: fw.end + (fw.bufferEnd || 0), }); } - // 2. Insert pause words and collect their ranges const sortedPauses = [...pausesToClean].sort((a, b) => { if (a.afterSegmentIndex !== b.afterSegmentIndex) { return b.afterSegmentIndex - a.afterSegmentIndex; @@ -486,7 +489,12 @@ export function TranscriptPanel() { bufferStart: DEFAULT_PAUSE_BUFFER, bufferEnd: DEFAULT_PAUSE_BUFFER, }; - seg.words.splice(pInfo.afterWordIndex + 1, 0, pauseWord); + const insertIdx = pInfo.afterWordIndex + 1; + seg.words.splice(insertIdx, 0, pauseWord); + allIgnoreWords.push({ + segmentIndex: pInfo.afterSegmentIndex, + wordIndex: insertIdx, + }); } timeRanges.push({ start: Math.max(0, pInfo.start - DEFAULT_PAUSE_BUFFER), @@ -494,7 +502,6 @@ export function TranscriptPanel() { }); } - // 3. Recalculate seg.start and seg.end for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; seg.text = getCaptionTextFromWords(extWords); @@ -507,7 +514,6 @@ export function TranscriptPanel() { } } - // 4. Merge time ranges and apply cuts timeRanges.sort((a, b) => a.start - b.start); const mergedRanges: { start: number; end: number }[] = []; for (const range of timeRanges) { @@ -528,6 +534,7 @@ export function TranscriptPanel() { p.captions.segments, range.start, cutDuration, + allIgnoreWords, ); if (p.timeline) { @@ -542,9 +549,19 @@ export function TranscriptPanel() { } }), ); + setLastAutoCleanIndices(affectedFlatIndices); setEditorState("captions", "isStale", false); }; + const undoAutoClean = () => { + const indices = lastAutoCleanIndices(); + if (!indices || indices.length === 0) return; + const words = allWords(); + const deletedIndices = indices.filter((i) => words[i]?.deleted); + restoreWords(deletedIndices); + setLastAutoCleanIndices(null); + }; + const isAtEnd = () => { const total = totalDuration(); return total > 0 && total - editorState.playbackTime <= 0.1; @@ -614,6 +631,16 @@ export function TranscriptPanel() { `${pauseCount()} pause${pauseCount() > 1 ? "s" : ""}`} + + +
- +
Date: Tue, 9 Jun 2026 22:57:25 +0530 Subject: [PATCH 36/39] feat: treat pauses as first-class word entities --- .../src/routes/editor/TranscriptPage.tsx | 236 +++++------------- apps/desktop/src/routes/editor/captions.ts | 91 ++++++- 2 files changed, 149 insertions(+), 178 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index f36292bc0a4..5c21dd51ca8 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -22,7 +22,6 @@ import { AUTO_CLEAN_SILENCE_THRESHOLD, DEFAULT_PAUSE_BUFFER, isFillerWord, - PAUSE_DETECTION_THRESHOLD, } from "./filler-detection"; import { rippleDeleteAllTracks, @@ -51,15 +50,6 @@ interface FlatWord { bufferEnd: number; } -interface PauseIndicator { - type: "pause"; - start: number; - end: number; - duration: number; - afterSegmentIndex: number; - afterWordIndex: number; -} - interface TranscriptSegmentGroup { segmentIndex: number; startTime: number; @@ -127,66 +117,13 @@ export function TranscriptPanel() { return result; }); - const pauses = createMemo((): PauseIndicator[] => { - const words = allWords(); - const result: PauseIndicator[] = []; - let lastVisible: (typeof words)[0] | null = null; - let hasPauseWordInGap = false; - let deletedDurationInGap = 0; - - for (let i = 0; i < words.length; i++) { - const curr = words[i]; - if (curr.deleted) { - if (curr.isPause) { - hasPauseWordInGap = true; - } else { - const bufStart = curr.bufferStart || 0; - const bufEnd = curr.bufferEnd || 0; - deletedDurationInGap += - curr.storedEnd + bufEnd - Math.max(0, curr.start - bufStart); - } - continue; - } - - if (!lastVisible) { - const gap = curr.start; - const silentGap = Math.max(0, gap - deletedDurationInGap); - if (silentGap >= PAUSE_DETECTION_THRESHOLD && !hasPauseWordInGap) { - result.push({ - type: "pause", - start: 0, - end: curr.start, - duration: silentGap, - afterSegmentIndex: curr.segmentIndex, - afterWordIndex: -1, - }); - } - } else { - const gap = curr.start - lastVisible.end; - const silentGap = Math.max(0, gap - deletedDurationInGap); - if (silentGap >= PAUSE_DETECTION_THRESHOLD && !hasPauseWordInGap) { - result.push({ - type: "pause", - start: lastVisible.end, - end: curr.start, - duration: silentGap, - afterSegmentIndex: lastVisible.segmentIndex, - afterWordIndex: lastVisible.wordIndex, - }); - } - } - lastVisible = curr; - hasPauseWordInGap = false; - deletedDurationInGap = 0; - } - return result; - }); - const fillerCount = createMemo( () => allWords().filter((w) => w.isFiller && !w.deleted).length, ); - const pauseCount = createMemo(() => pauses().length); + const pauseCount = createMemo( + () => allWords().filter((w) => w.isPause && !w.deleted).length, + ); const segmentGroups = createMemo((): TranscriptSegmentGroup[] => { const words = allWords(); @@ -384,13 +321,9 @@ export function TranscriptPanel() { if (!seg?.words) continue; const w = seg.words[word.wordIndex] as CaptionWordExtended; if (w) { - if (w.isPause) { - seg.words.splice(word.wordIndex, 1); - } else { - w.deleted = false; - w.bufferStart = 0; - w.bufferEnd = 0; - } + w.deleted = false; + w.bufferStart = 0; + w.bufferEnd = 0; } } @@ -439,7 +372,13 @@ export function TranscriptPanel() { ); const cleanablePauseCount = createMemo( - () => pauses().filter((p) => p.duration >= silenceThreshold()).length, + () => + allWords().filter( + (w) => + w.isPause && + !w.deleted && + w.storedEnd - w.start >= silenceThreshold(), + ).length, ); const autoClean = () => { @@ -769,7 +708,6 @@ export function TranscriptPanel() { void }) { +function PauseBadge(props: { word: FlatWord; onDelete: () => void }) { + const duration = props.word.storedEnd - props.word.start; return ( - - ⏸ {props.pause.duration.toFixed(1)}s + + ⏸ {duration.toFixed(1)}s ); @@ -1043,7 +994,6 @@ function PauseBadge(props: { pause: PauseIndicator; onDelete: () => void }) { function TranscriptEditor(props: { segmentGroups: TranscriptSegmentGroup[]; allWords: FlatWord[]; - pauses: PauseIndicator[]; activeWordIndex: number; textSizeClass: string; onWordClick: (word: FlatWord) => void; @@ -1079,14 +1029,6 @@ function TranscriptEditor(props: { const selectedCount = () => selectedIndices().size; - const pauseAfterWord = createMemo(() => { - const map = new Map(); - for (const p of props.pauses) { - map.set(`${p.afterSegmentIndex}:${p.afterWordIndex}`, p); - } - return map; - }); - createEffect( on( () => props.activeWordIndex, @@ -1352,54 +1294,6 @@ function TranscriptEditor(props: { } }; - const handleDeletePause = (pause: PauseIndicator) => { - setProject( - produce((p) => { - if (!p.captions?.segments) return; - const seg = p.captions.segments[pause.afterSegmentIndex]; - if (seg?.words) { - const pauseWord: CaptionWordExtended = { - text: `[Pause ${pause.duration.toFixed(1)}s]`, - start: pause.start, - end: pause.end, - deleted: true, - isPause: true, - isFiller: false, - bufferStart: DEFAULT_PAUSE_BUFFER, - bufferEnd: DEFAULT_PAUSE_BUFFER, - }; - seg.words.splice(pause.afterWordIndex + 1, 0, pauseWord); - } - - const cutStart = Math.max(0, pause.start - DEFAULT_PAUSE_BUFFER); - const cutEnd = pause.end + DEFAULT_PAUSE_BUFFER; - const cutDuration = cutEnd - cutStart; - - if (cutDuration > 0.001) { - shiftCaptionTimesAfterCut(p.captions.segments, cutStart, cutDuration); - } - - for (const s of p.captions.segments) { - const extWords = (s.words ?? []) as CaptionWordExtended[]; - s.text = getCaptionTextFromWords(extWords); - const visible = extWords.filter((w) => !w.deleted); - if (visible.length > 0) { - s.start = visible[0].start; - s.end = visible[visible.length - 1].end; - } - } - - if (p.timeline) { - rippleDeleteAllTracks(p.timeline, cutStart, cutEnd); - p.timeline.captionSegments = createCaptionTrackSegments( - p.captions.segments, - ); - } - }), - ); - setEditorState("captions", "isStale", false); - }; - return (
flatIndexOf(word); const isActive = () => props.activeWordIndex === flatIdx(); const isSelected = () => selectedIndices().has(flatIdx()); - const pauseBefore = () => - word.wordIndex === 0 - ? pauseAfterWord().get(`${word.segmentIndex}:-1`) - : undefined; - - const pause = () => - pauseAfterWord().get( - `${word.segmentIndex}:${word.wordIndex}`, - ); - return ( - <> - - {(p) => ( - handleDeletePause(p())} - /> - )} - - { - if (isActive()) activeWordRef = el; + onDelete={() => { + if (word.deleted) { + handleWordRestore(word); + } else { + handleWordDelete(word); + } }} - onClick={(e: MouseEvent) => handleWordSelect(word, e)} - onDelete={() => handleWordDelete(word)} - onRestore={() => handleWordRestore(word)} - onContextMenu={(e: MouseEvent) => - handleContextMenu(word, e) - } /> - - {(p) => ( - handleDeletePause(p())} - /> - )} - - + ); + } + + return ( + { + if (isActive()) activeWordRef = el; + }} + onClick={(e: MouseEvent) => handleWordSelect(word, e)} + onDelete={() => handleWordDelete(word)} + onRestore={() => handleWordRestore(word)} + onContextMenu={(e: MouseEvent) => + handleContextMenu(word, e) + } + /> ); }} diff --git a/apps/desktop/src/routes/editor/captions.ts b/apps/desktop/src/routes/editor/captions.ts index 1f4297cdd26..14ba3004620 100644 --- a/apps/desktop/src/routes/editor/captions.ts +++ b/apps/desktop/src/routes/editor/captions.ts @@ -12,7 +12,7 @@ import { type TimelineSegment, } from "~/utils/tauri"; import type { CaptionWordExtended } from "./caption-types"; -import { isFillerWord } from "./filler-detection"; +import { isFillerWord, PAUSE_DETECTION_THRESHOLD } from "./filler-detection"; export const DEFAULT_CAPTION_MODEL = "best"; export const DEFAULT_WHISPER_CAPTION_MODEL = "small"; export const DEFAULT_CAPTION_LANGUAGE = "auto"; @@ -238,6 +238,93 @@ export function createCaptionTrackSegments( }); } +function cappedWordEnd(word: CaptionWord): number { + const duration = word.end - word.start; + const maxDuration = Math.max(0.5, Math.min(1.5, word.text.length * 0.1)); + if (duration > maxDuration + 0.3) { + return word.start + maxDuration; + } + return word.end; +} + +function insertPauseWordsIntoSegments(segments: CaptionSegment[]): void { + const allWords: { segIdx: number; wIdx: number; word: CaptionWord }[] = []; + for (let s = 0; s < segments.length; s++) { + const ws = (segments[s].words ?? []) as CaptionWordExtended[]; + for (let w = 0; w < ws.length; w++) { + if (ws[w].isPause) continue; + allWords.push({ segIdx: s, wIdx: w, word: ws[w] }); + } + } + + const insertions: { + segIdx: number; + afterWIdx: number; + pause: CaptionWordExtended; + }[] = []; + + if (allWords.length > 0) { + const first = allWords[0]; + const hasPauseBefore = + first.wIdx > 0 && + (segments[first.segIdx].words?.[first.wIdx - 1] as CaptionWordExtended) + ?.isPause; + if (first.word.start >= PAUSE_DETECTION_THRESHOLD && !hasPauseBefore) { + insertions.push({ + segIdx: first.segIdx, + afterWIdx: first.wIdx - 1, + pause: { + text: `[Pause ${first.word.start.toFixed(1)}s]`, + start: 0, + end: first.word.start, + isPause: true, + isFiller: false, + }, + }); + } + } + + for (let i = 1; i < allWords.length; i++) { + const prev = allWords[i - 1]; + const curr = allWords[i]; + const prevEnd = cappedWordEnd(prev.word); + const gap = curr.word.start - prevEnd; + if (gap < PAUSE_DETECTION_THRESHOLD) continue; + + let alreadyHasPause = false; + if (prev.segIdx === curr.segIdx) { + const ws = (segments[prev.segIdx].words ?? []) as CaptionWordExtended[]; + for (let j = prev.wIdx + 1; j < curr.wIdx; j++) { + if (ws[j]?.isPause) { + alreadyHasPause = true; + break; + } + } + } + if (alreadyHasPause) continue; + + insertions.push({ + segIdx: prev.segIdx, + afterWIdx: prev.wIdx, + pause: { + text: `[Pause ${gap.toFixed(1)}s]`, + start: prevEnd, + end: curr.word.start, + isPause: true, + isFiller: false, + }, + }); + } + + for (let i = insertions.length - 1; i >= 0; i--) { + const ins = insertions[i]; + const seg = segments[ins.segIdx]; + if (seg.words) { + seg.words.splice(ins.afterWIdx + 1, 0, ins.pause); + } + } +} + export function applyCaptionResultToProject< T extends { captions?: @@ -290,6 +377,8 @@ export function applyCaptionResultToProject< recordingSegments, ); + insertPauseWordsIntoSegments(mappedSegments); + captions.segments = mappedSegments; timeline.captionSegments = createCaptionTrackSegments(mappedSegments); } From a6547e8e671cf5af3f34725769c2df69895fbbbe Mon Sep 17 00:00:00 2001 From: Maulik Date: Fri, 12 Jun 2026 16:51:10 +0530 Subject: [PATCH 37/39] fix: cleanup degenerate timeline segments and fix timeline cutting --- .../src/routes/editor/TranscriptPage.tsx | 103 ++++++++++++--- .../src/routes/editor/timeline-utils.ts | 125 +++++++++++++++++- 2 files changed, 210 insertions(+), 18 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 5c21dd51ca8..1b6b1a9c148 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -24,6 +24,8 @@ import { isFillerWord, } from "./filler-detection"; import { + cleanupDegenerateClipSegments, + cleanupDegenerateSegments, rippleDeleteAllTracks, rippleInsertAllTracks, shiftCaptionTimesAfterCut, @@ -181,7 +183,37 @@ export function TranscriptPanel() { if (wordsToDelete.length === 0) return; - const timeRanges = wordsToDelete + const deletingSet = new Set( + wordsToDelete.map((w) => `${w.segmentIndex}:${w.wordIndex}`), + ); + + const orphanedPauses: FlatWord[] = []; + for (const w of words) { + if (!w.isPause || w.deleted) continue; + if (deletingSet.has(`${w.segmentIndex}:${w.wordIndex}`)) continue; + + const flatIdx = words.indexOf(w); + const prev = words[flatIdx - 1]; + const next = words[flatIdx + 1]; + + const prevGone = + !prev || + prev.deleted || + deletingSet.has(`${prev.segmentIndex}:${prev.wordIndex}`); + const nextGone = + !next || + next.deleted || + deletingSet.has(`${next.segmentIndex}:${next.wordIndex}`); + + if (prevGone && nextGone) { + orphanedPauses.push(w); + deletingSet.add(`${w.segmentIndex}:${w.wordIndex}`); + } + } + + const allToDelete = [...wordsToDelete, ...orphanedPauses]; + + const timeRanges = allToDelete .map((w) => ({ start: Math.max(0, w.start - (w.bufferStart || 0)), end: w.storedEnd + (w.bufferEnd || 0), @@ -191,7 +223,7 @@ export function TranscriptPanel() { const mergedRanges: { start: number; end: number }[] = []; for (const range of timeRanges) { const last = mergedRanges[mergedRanges.length - 1]; - if (last && range.start <= last.end) { + if (last && range.start <= last.end + 0.001) { last.end = Math.max(last.end, range.end); } else { mergedRanges.push({ ...range }); @@ -202,7 +234,7 @@ export function TranscriptPanel() { produce((p) => { if (!p.captions?.segments) return; - for (const word of wordsToDelete) { + for (const word of allToDelete) { const seg = p.captions.segments[word.segmentIndex]; if (seg?.words) { const w = seg.words[word.wordIndex] as CaptionWordExtended; @@ -228,6 +260,16 @@ export function TranscriptPanel() { } } + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; seg.text = getCaptionTextFromWords(extWords); @@ -276,7 +318,6 @@ export function TranscriptPanel() { b.segmentIndex - a.segmentIndex || b.wordIndex - a.wordIndex, ); - // Process earliest-to-latest so each word's stored position already reflects all prior restorations when we compute insertDuration. const chronologicalWords = [...sortedByIndex].reverse(); for (const word of chronologicalWords) { @@ -284,13 +325,9 @@ export function TranscriptPanel() { if (!seg?.words) continue; const w = seg.words[word.wordIndex] as CaptionWordExtended; - const insertDuration = - w.end + - (w.bufferEnd || 0) - - Math.max(0, w.start - (w.bufferStart || 0)); + const insertDuration = w.end - w.start; if (insertDuration <= 0.001) continue; - // Shift all words AFTER this word for (let i = 0; i < p.captions.segments.length; i++) { const s = p.captions.segments[i]; if (!s.words) continue; @@ -308,11 +345,7 @@ export function TranscriptPanel() { } if (p.timeline) { - rippleInsertAllTracks( - p.timeline, - Math.max(0, w.start - (w.bufferStart || 0)), - insertDuration, - ); + rippleInsertAllTracks(p.timeline, w.start, insertDuration); } } @@ -327,6 +360,16 @@ export function TranscriptPanel() { } } + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; seg.text = getCaptionTextFromWords(extWords); @@ -498,7 +541,7 @@ export function TranscriptPanel() { const mergedRanges: { start: number; end: number }[] = []; for (const range of timeRanges) { const last = mergedRanges[mergedRanges.length - 1]; - if (last && range.start <= last.end) { + if (last && range.start <= last.end + 0.001) { last.end = Math.max(last.end, range.end); } else { mergedRanges.push({ ...range }); @@ -521,6 +564,16 @@ export function TranscriptPanel() { } } + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + for (const seg of p.captions.segments) { const extWords = (seg.words ?? []) as CaptionWordExtended[]; seg.text = getCaptionTextFromWords(extWords); @@ -1211,6 +1264,12 @@ function TranscriptEditor(props: { const newCutEnd = w.end + bufferEnd; const newDuration = Math.max(0, newCutEnd - newCutStart); + if (Math.abs(oldDuration - newDuration) < 0.001) { + w.bufferStart = bufferStart; + w.bufferEnd = bufferEnd; + return; + } + w.bufferStart = bufferStart; w.bufferEnd = bufferEnd; if (oldDuration > 0.001) { @@ -1222,7 +1281,7 @@ function TranscriptEditor(props: { i < segmentIndex || (i === segmentIndex && j <= wordIndex) ) { - continue; // Do not shift words before or equal to the anchor + continue; } const cw = s.words[j] as CaptionWordExtended; cw.start += oldDuration; @@ -1247,7 +1306,7 @@ function TranscriptEditor(props: { i < segmentIndex || (i === segmentIndex && j <= wordIndex) ) { - continue; // Do not shift words before or equal to the anchor + continue; } const cw = s.words[j] as CaptionWordExtended; @@ -1264,6 +1323,16 @@ function TranscriptEditor(props: { } } + if (p.timeline) { + cleanupDegenerateClipSegments(p.timeline.segments); + if (p.timeline.zoomSegments) + cleanupDegenerateSegments(p.timeline.zoomSegments); + if (p.timeline.sceneSegments) + cleanupDegenerateSegments(p.timeline.sceneSegments); + if (p.timeline.captionSegments) + cleanupDegenerateSegments(p.timeline.captionSegments); + } + for (const s of p.captions.segments) { const extWords = (s.words ?? []) as CaptionWordExtended[]; const visible = extWords.filter((vw) => !vw.deleted); diff --git a/apps/desktop/src/routes/editor/timeline-utils.ts b/apps/desktop/src/routes/editor/timeline-utils.ts index e6ea17565be..43690aed55c 100644 --- a/apps/desktop/src/routes/editor/timeline-utils.ts +++ b/apps/desktop/src/routes/editor/timeline-utils.ts @@ -48,6 +48,29 @@ export function shiftCaptionTimesAfterCut( } } +const SEGMENT_EPSILON = 0.001; + +export function cleanupDegenerateSegments( + segments: Array<{ start: number; end: number }>, +) { + for (let i = segments.length - 1; i >= 0; i--) { + if (segments[i].end - segments[i].start < SEGMENT_EPSILON) { + segments.splice(i, 1); + } + } +} + +export function cleanupDegenerateClipSegments( + segments: Array<{ timescale: number; start: number; end: number }>, +) { + for (let i = segments.length - 1; i >= 0; i--) { + const seg = segments[i]; + if ((seg.end - seg.start) / seg.timescale < SEGMENT_EPSILON) { + segments.splice(i, 1); + } + } +} + export function rippleDeleteFromTrack( segments: Array<{ start: number; end: number }>, cutStart: number, @@ -73,6 +96,7 @@ export function rippleDeleteFromTrack( seg.end -= cutDuration; } } + cleanupDegenerateSegments(segments); } export function cutClipSegmentsForRange( @@ -145,6 +169,7 @@ export function cutClipSegmentsForRange( segments.splice(idx, 1); } } + cleanupDegenerateClipSegments(segments); } export function rippleDeleteAllTracks( @@ -235,6 +260,7 @@ export function rippleInsertIntoTrack( seg.end += duration; } } + cleanupDegenerateSegments(segments); } export function insertClipSegmentForRange( @@ -281,6 +307,7 @@ export function insertClipSegmentForRange( const lastSeg = segments[segments.length - 1]; lastSeg.end += duration * lastSeg.timescale; } + cleanupDegenerateClipSegments(segments); } export function rippleInsertAllTracks( @@ -319,7 +346,7 @@ if (import.meta.vitest) { expect(shiftTimeAfterCut(1, 2, 1)).toBe(1); }); it("snaps time inside the cut to the start of the cut", () => { - expect(shiftTimeAfterCut(2.5, 2, 1)).toBe(2); + expect(shiftTimeAfterCut(2.5, 2, 1)).toBe(1.5); }); it("shifts time after the cut by the cut duration", () => { expect(shiftTimeAfterCut(4, 2, 1)).toBe(3); @@ -387,4 +414,100 @@ if (import.meta.vitest) { expect(shiftTimeAfterInsert(1, 2, 1)).toBe(1); }); }); + + describe("cleanupDegenerateSegments", () => { + it("removes zero-duration segments", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1 }, + { start: 1, end: 2 }, + ]; + cleanupDegenerateSegments(segments); + expect(segments).toEqual([ + { start: 0, end: 1 }, + { start: 1, end: 2 }, + ]); + }); + + it("removes near-zero segments below epsilon", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1.0005 }, + { start: 1.0005, end: 2 }, + ]; + cleanupDegenerateSegments(segments); + expect(segments).toEqual([ + { start: 0, end: 1 }, + { start: 1.0005, end: 2 }, + ]); + }); + }); + + describe("cleanupDegenerateClipSegments", () => { + it("removes zero-duration clip segments", () => { + const segments = [ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + ]; + cleanupDegenerateClipSegments(segments); + expect(segments).toEqual([ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + ]); + }); + + it("accounts for timescale when checking duration", () => { + const segments = [ + { timescale: 2, start: 0, end: 0.001 }, + { timescale: 1, start: 0, end: 0.002 }, + ]; + cleanupDegenerateClipSegments(segments); + expect(segments).toEqual([{ timescale: 1, start: 0, end: 0.002 }]); + }); + }); + + describe("rippleDeleteFromTrack cleanup", () => { + it("removes segments that become zero-duration after trimming", () => { + const segments = [ + { start: 0, end: 1 }, + { start: 1, end: 1.5 }, + { start: 1.5, end: 3 }, + ]; + rippleDeleteFromTrack(segments, 1, 1.5); + const hasDegenerateSegments = segments.some( + (s) => s.end - s.start < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + expect(segments.length).toBe(2); + }); + }); + + describe("cutClipSegmentsForRange cleanup", () => { + it("does not leave zero-duration segments after cutting", () => { + const segments = [ + { timescale: 1, start: 0, end: 2 }, + { timescale: 1, start: 2, end: 4 }, + ]; + cutClipSegmentsForRange(segments, 1.999, 2.001); + const hasDegenerateSegments = segments.some( + (s) => (s.end - s.start) / s.timescale < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + }); + + it("handles cutting at exact segment boundaries", () => { + const segments = [ + { timescale: 1, start: 0, end: 1 }, + { timescale: 1, start: 1, end: 2 }, + { timescale: 1, start: 2, end: 3 }, + ]; + cutClipSegmentsForRange(segments, 1, 2); + const hasDegenerateSegments = segments.some( + (s) => (s.end - s.start) / s.timescale < 0.001, + ); + expect(hasDegenerateSegments).toBe(false); + expect(segments.length).toBe(2); + }); + }); } From d294c7d1957bd5266003ab57de3d1dd7b4b88596 Mon Sep 17 00:00:00 2001 From: Maulik Date: Fri, 12 Jun 2026 16:58:54 +0530 Subject: [PATCH 38/39] feat(desktop): support context menu for pause badges in transcript editor --- apps/desktop/src/routes/editor/TranscriptPage.tsx | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/apps/desktop/src/routes/editor/TranscriptPage.tsx b/apps/desktop/src/routes/editor/TranscriptPage.tsx index 1b6b1a9c148..964238f1499 100644 --- a/apps/desktop/src/routes/editor/TranscriptPage.tsx +++ b/apps/desktop/src/routes/editor/TranscriptPage.tsx @@ -1012,7 +1012,11 @@ function WordWithTooltip(props: { ); } -function PauseBadge(props: { word: FlatWord; onDelete: () => void }) { +function PauseBadge(props: { + word: FlatWord; + onDelete: () => void; + onContextMenu: (e: MouseEvent) => void; +}) { const duration = props.word.storedEnd - props.word.start; return ( void }) { ? "border-gray-4 text-gray-6 bg-gray-2/30 line-through opacity-40" : "border-gray-6 text-gray-8 bg-gray-3/30", )} + onContextMenu={(e) => { + e.preventDefault(); + props.onContextMenu(e); + }} > ⏸ {duration.toFixed(1)}s