import AVFoundation import CoreMedia import Foundation /// Owns one `AVAssetWriter`. Mirrors telegram-ios's /// [VideoRecorder.swift](file:///Users/agra/projects/telegram-ios/submodules/Camera/Sources/VideoRecorder.swift) /// state machine closely — lazy per-type input creation, /// gated `startWriting`, pending-audio-buffer queue: /// /// 1. `start()` only creates the `AVAssetWriter` shell and sets /// `recordingStartSampleTime` to wall-clock now. No inputs yet. /// 2. First **video** sample → create `videoInput` from its /// `CMFormatDescription` (`sourceFormatHint:`) + transform. /// Pre-`startWriting` because audio input may still be pending. /// 3. First **audio** sample (if `hasAudio`) → create `audioInput` /// with sample-rate / channel-layout extracted from the audio /// `CMFormatDescription` merged into `baseAudioSettings` /// (`recommendedAudioSettingsForAssetWriter`). /// 4. Next video sample arrives with both inputs added → /// `assetWriter.startWriting()`. Sample is dropped (telegram's /// behaviour — initial frame loss is acceptable, the writer needs /// one cycle to settle). /// 5. Subsequent video sample → `startSession(atSourceTime: pts)`, /// `recordingStartSampleTime = pts`. Appends begin. /// 6. Audio samples that arrive before `recordingStartSampleTime` is /// set are queued in `pendingAudioSampleBuffers`. After each /// successful video append, the queue is drained for samples whose /// `endTime <= lastVideoSampleTime`. /// 7. `stop()` sets `recordingStopSampleTime` to wall-clock now. /// Sample callbacks set `hasAllVideoBuffers` / `hasAllAudioBuffers` /// when their PTS crosses the stop time. `maybeFinish()` runs when /// both flags are set, gates `finishWriting` on /// `writer.status == .writing`. If audio never arrived, the audio /// flag is set synchronously in `stop()` so the video side can /// complete on its own. 
///
/// Sample-count diagnostics emit via [onDiagnostic] at each major
/// checkpoint so the operator can verify "audio actually captured"
/// without instrumenting the call sites. The closure is wired to the
/// Dart-side `Log.tag('camera').i(...)` via the `ux/camera/events`
/// channel — visible in `~/banlu/tools/log_server/data/banlu.jsonl`.
///
/// Every terminal outcome (finished, failed, cancelled) is recorded
/// once, so a `stop()` issued after the recorder already reached a
/// terminal state is answered immediately instead of waiting for a
/// sample callback that will never run.
final class VideoRecorder {

    /// Maps Flutter's `DeviceOrientation` to the rotation transform
    /// embedded as `AVAssetWriterInput.transform`. Source buffers
    /// are portrait-shape (see [CameraInstance.applyVideoOrientationOnPreview]),
    /// so the table assumes portrait source — see
    /// [CameraOrientationTests] for the four cases.
    ///
    /// - Parameter orientation: Device orientation reported by Flutter.
    /// - Returns: Identity for portrait-up, otherwise a pure rotation.
    public static func transform(
        for orientation: DeviceOrientationFlutter
    ) -> CGAffineTransform {
        switch orientation {
        case .portraitUp:
            return .identity
        case .portraitDown:
            return CGAffineTransform(rotationAngle: .pi)
        case .landscapeLeft:
            return CGAffineTransform(rotationAngle: -.pi / 2)
        case .landscapeRight:
            return CGAffineTransform(rotationAngle: .pi / 2)
        }
    }

    // MARK: - immutable config

    private let url: URL
    private let videoTransform: CGAffineTransform
    private let hasAudio: Bool
    private let baseVideoSettings: [String: Any]?
    private let baseAudioSettings: [String: Any]
    private let recorderQueue: DispatchQueue

    // MARK: - mutable state (always touched on recorderQueue)

    private var writer: AVAssetWriter?
    private var videoInput: AVAssetWriterInput?
    private var audioInput: AVAssetWriterInput?

    /// Wall-clock "start" time set by [start], then overwritten to the
    /// first video sample's PTS once the session is started. Used to
    /// gate samples whose PTS is older than start.
    private var recordingStartSampleTime: CMTime = .invalid

    /// Set by [stop]. Samples whose PTS crosses this set the matching
    /// `hasAllXBuffers` flag and trigger [maybeFinish].
    private var recordingStopSampleTime: CMTime = .invalid

    /// PTS of the last video sample successfully appended. Used to
    /// gate audio drains (audio samples whose `endTime` exceeds this
    /// stay queued until video catches up).
    private var lastVideoSampleTime: CMTime = .invalid

    private var startedSession = false
    private var stopped = false
    private var hasAllVideoBuffers = false
    private var hasAllAudioBuffers = false
    private var failed = false

    /// Audio samples arriving before video has caught up. Drained
    /// after each successful video append.
    private var pendingAudioSampleBuffers: [CMSampleBuffer] = []

    /// Pending [stop] callback; cleared by [deliver] when it fires.
    private var completion: ((Result<URL, Error>) -> Void)?

    /// First terminal outcome handed to [deliver]. Replayed to any
    /// [stop] call that arrives after the recorder already finished,
    /// failed or was cancelled.
    private var finalOutcome: Result<URL, Error>?

    // MARK: - diagnostics (emit via [onDiagnostic] → ux.Log)

    private func diag(_ message: String) {
        onDiagnostic?(message)
    }

    private var videoReceived: Int = 0
    private var videoAppended: Int = 0
    private var audioReceived: Int = 0
    private var audioAppended: Int = 0
    private var audioQueued: Int = 0

    /// Set by [CameraInstance] to ship diagnostic messages over the
    /// `ux/camera/events` channel as `{event: "diagnostic"}`. The
    /// Dart-side controller turns those into `Log.tag('camera').i(...)`
    /// — so they land in the log_server pipeline and can be tailed
    /// from `~/banlu/tools/log_server/data/banlu.jsonl`.
    var onDiagnostic: ((String) -> Void)?

    // MARK: - init / start

    /// Stores the configuration; no file or writer is created here.
    ///
    /// - Parameters:
    ///   - url: Destination file for the recording.
    ///   - orientation: Converted once via [transform] and applied to
    ///     the video input when it is created.
    ///   - hasAudio: When `true`, `startWriting` waits for the audio
    ///     input to exist as well as the video input.
    ///   - baseVideoSettings: Output settings for the video input
    ///     (an empty dictionary is used when `nil`).
    ///   - baseAudioSettings: Output settings for the audio input;
    ///     sample rate, channel count and channel layout are
    ///     overwritten from the first audio sample.
    ///   - recorderQueue: Queue on which all sample handling and
    ///     state changes are performed.
    init(
        url: URL,
        orientation: DeviceOrientationFlutter,
        hasAudio: Bool,
        baseVideoSettings: [String: Any]?,
        baseAudioSettings: [String: Any],
        recorderQueue: DispatchQueue
    ) {
        self.url = url
        self.videoTransform = VideoRecorder.transform(for: orientation)
        self.hasAudio = hasAudio
        self.baseVideoSettings = baseVideoSettings
        self.baseAudioSettings = baseAudioSettings
        self.recorderQueue = recorderQueue
    }

    /// Open the file. Inputs are created lazily on the first sample
    /// of each type — see class doc. Throws on `AVAssetWriter`
    /// allocation failure (typically a path / file-system issue).
    func start() throws {
        let writer = try AVAssetWriter(url: url, fileType: .mp4)
        self.writer = writer
        // Sentinel until the first video sample's PTS overwrites it —
        // see [handleVideo] when it calls `writer.startSession`.
        recordingStartSampleTime = CMTime(
            seconds: CACurrentMediaTime(),
            preferredTimescale: CMTimeScale(NSEC_PER_SEC)
        )
        diag("start: file=\(url.lastPathComponent) hasAudio=\(hasAudio)")
    }

    /// Hard cancel — drop pending audio, `cancelWriting` if the writer
    /// is writing, delete the partial file. Mirrors telegram-ios's
    /// [`VideoRecorder.cancelRecording`](file:///Users/agra/projects/telegram-ios/submodules/Camera/Sources/VideoRecorder.swift#L329).
    /// Used by [CameraInstance.dispose] when a recording is in flight
    /// at teardown — there's no caller to deliver the file to, so no
    /// reason to wait for `finishWriting` to flush.
    ///
    /// - Parameter completion: Invoked on `recorderQueue` once the
    ///   cancel has been processed (also when the recorder was
    ///   already stopped or failed, in which case nothing else is done).
    func cancel(completion: (() -> Void)? = nil) {
        recorderQueue.async {
            if self.stopped || self.failed {
                completion?()
                return
            }
            self.stopped = true
            self.pendingAudioSampleBuffers = []
            if let writer = self.writer, writer.status == .writing {
                writer.cancelWriting()
            }
            try? FileManager.default.removeItem(at: self.url)
            self.diag("cancel: vRecv=\(self.videoReceived) aRecv=\(self.audioReceived)")
            // Resolve any pending stop() completion so the caller's
            // Future doesn't dangle, and remember the cancellation so
            // a stop() issued later is answered too.
            self.deliver(.failure(NSError(
                domain: "ux.camera",
                code: -12,
                userInfo: [NSLocalizedDescriptionKey: "Recording cancelled"]
            )))
            completion?()
        }
    }

    /// Stop. Sets `recordingStopSampleTime` so the next video / audio
    /// sample crossing it flips the matching `hasAllXBuffers` flag,
    /// which triggers `maybeFinish` → `finishWriting`. Completion
    /// fires once when the writer finishes.
    ///
    /// Idempotent: a second call while a stop is already in flight is
    /// silently dropped. A call made after the recorder already
    /// reached a terminal state (finished, failed, cancelled) receives
    /// that recorded outcome immediately.
    ///
    /// - Parameter completion: Receives the file URL on success or
    ///   the failure error. Called on `recorderQueue`.
    func stop(completion: @escaping (Result<URL, Error>) -> Void) {
        recorderQueue.async {
            if self.completion != nil { return }
            // Already terminal: no sample handler will run again, so
            // storing the callback would leave it dangling forever.
            if let outcome = self.finalOutcome {
                completion(outcome)
                return
            }
            self.completion = completion
            let stopTime = CMTime(
                seconds: CACurrentMediaTime(),
                preferredTimescale: CMTimeScale(NSEC_PER_SEC)
            )
            self.recordingStopSampleTime = stopTime
            self.diag("stop: vRecv=\(self.videoReceived) vApp=\(self.videoAppended)"
                + " aRecv=\(self.audioReceived) aApp=\(self.audioAppended)"
                + " aQueued=\(self.pendingAudioSampleBuffers.count)")

            // Nothing ever arrived — no sample callback will ever
            // trigger `maybeFinish`. Cancel the writer instead.
            if !self.startedSession {
                self.writer?.cancelWriting()
                self.failed = true
                self.deliver(.failure(NSError(
                    domain: "ux.camera",
                    code: -11,
                    userInfo: [
                        NSLocalizedDescriptionKey:
                            "Recording stopped before any samples were written"
                    ]
                )))
                return
            }

            // No audio path (mic permission denied, etc.) — the audio
            // side is "drained" by definition. `maybeFinish` then only
            // waits for the next video sample whose PTS crosses
            // `stopTime` (~one frame later, ~33ms at 30fps).
            if self.audioInput == nil || self.audioReceived == 0 {
                self.hasAllAudioBuffers = true
            }
        }
    }

    // MARK: - sample append (from videoBufferQueue / audioBufferQueue)

    /// Hands a video sample to the recorder; processed asynchronously
    /// on `recorderQueue`.
    func appendVideo(_ sampleBuffer: CMSampleBuffer) {
        recorderQueue.async {
            self.handleVideo(sampleBuffer)
        }
    }

    /// Hands an audio sample to the recorder; processed asynchronously
    /// on `recorderQueue`.
    func appendAudio(_ sampleBuffer: CMSampleBuffer) {
        recorderQueue.async {
            self.handleAudio(sampleBuffer)
        }
    }

    // MARK: - recorderQueue handlers

    /// Video side of the state machine: lazy input creation, gated
    /// `startWriting` / `startSession`, stop-time gating, append, and
    /// a drain of queued audio after each append.
    private func handleVideo(_ sampleBuffer: CMSampleBuffer) {
        guard !stopped, !failed else { return }
        guard let writer = writer else { return }
        guard
            let formatDescription = CMSampleBufferGetFormatDescription(sampleBuffer),
            CMFormatDescriptionGetMediaType(formatDescription) == kCMMediaType_Video
        else { return }

        videoReceived += 1
        let presentationTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)

        // 1. Lazy create the video input on first video sample, with
        //    the buffer's format description as `sourceFormatHint`.
        if videoInput == nil {
            let videoSettings = baseVideoSettings ?? [:]
            guard writer.canApply(outputSettings: videoSettings, forMediaType: .video) else {
                fail(NSError(domain: "ux.camera", code: -31,
                             userInfo: [NSLocalizedDescriptionKey: "canApply videoSettings failed"]))
                return
            }
            let input = AVAssetWriterInput(
                mediaType: .video,
                outputSettings: videoSettings,
                sourceFormatHint: formatDescription
            )
            input.expectsMediaDataInRealTime = true
            input.transform = videoTransform
            guard writer.canAdd(input) else {
                fail(NSError(domain: "ux.camera", code: -30,
                             userInfo: [NSLocalizedDescriptionKey: "canAdd videoInput failed"]))
                return
            }
            writer.add(input)
            videoInput = input
            diag("video input added")
        }

        // 2. Writer state machine
        if writer.status == .unknown {
            // Drop samples that arrived BEFORE the wall-clock start
            // (rare, but happens if the session was already running
            // before start() was called).
            if presentationTime < recordingStartSampleTime { return }
            // Only start the writer when ALL needed inputs are ready.
            if videoInput != nil && (audioInput != nil || !hasAudio) {
                if !writer.startWriting() {
                    fail(writer.error)
                    return
                }
                diag("startWriting")
            }
            // Drop this sample regardless — the writer needs a cycle
            // to settle. Next sample will hit the `.writing` branch.
            return
        } else if writer.status == .writing && !startedSession {
            writer.startSession(atSourceTime: presentationTime)
            recordingStartSampleTime = presentationTime
            lastVideoSampleTime = presentationTime
            startedSession = true
            diag(String(format: "startSession at %.3fs", presentationTime.seconds))
        }

        // Drop pre-start samples (post-startSession).
        if recordingStartSampleTime == .invalid || presentationTime < recordingStartSampleTime {
            return
        }

        guard writer.status == .writing, startedSession else {
            // A writer that failed on its own would otherwise swallow
            // every later sample silently and leave stop() waiting
            // forever; surface the failure instead.
            if writer.status == .failed {
                fail(writer.error)
            }
            return
        }

        // 3. Stop-time gating — set hasAllVideoBuffers when we
        //    see a sample past stop time, trigger finish.
        if recordingStopSampleTime.isValid && presentationTime > recordingStopSampleTime {
            hasAllVideoBuffers = true
            maybeFinish()
            return
        }

        guard let input = videoInput else { return }

        // Real-time capture; we can't backpressure the camera, so wait
        // briefly for the input. The wait ends early if the writer is
        // no longer writing, in which case the recording has failed.
        guard waitUntilReady(input) else {
            fail(writer.error)
            return
        }

        if input.append(sampleBuffer) {
            lastVideoSampleTime = presentationTime
            videoAppended += 1
        } else if writer.status == .failed {
            // Append failed because the writer failed: report it now
            // rather than stalling until stop() is called.
            fail(writer.error)
            return
        }

        // 4. Drain any pending audio whose endTime now fits
        //    under lastVideoSampleTime.
        if !tryAppendingPendingAudioBuffers() {
            fail(writer.error)
        }
    }

    /// Audio side of the state machine: lazy input creation from the
    /// first sample's format, start / stop gating, then append or
    /// queue depending on how far video has progressed.
    private func handleAudio(_ sampleBuffer: CMSampleBuffer) {
        guard !stopped, !failed, hasAudio else { return }
        guard let writer = writer else { return }
        guard
            let formatDescription = CMSampleBufferGetFormatDescription(sampleBuffer),
            CMFormatDescriptionGetMediaType(formatDescription) == kCMMediaType_Audio
        else { return }

        audioReceived += 1

        // 1. Lazy create audio input on first audio sample, with
        //    sample-rate / channel-layout extracted from the
        //    sample's CMAudioFormatDescription.
        if audioInput == nil {
            var audioSettings = baseAudioSettings
            if let asbd = CMAudioFormatDescriptionGetStreamBasicDescription(formatDescription) {
                audioSettings[AVSampleRateKey] = asbd.pointee.mSampleRate
                audioSettings[AVNumberOfChannelsKey] = asbd.pointee.mChannelsPerFrame
            }

            var channelLayoutSize: Int = 0
            let channelLayoutPtr = CMAudioFormatDescriptionGetChannelLayout(
                formatDescription,
                sizeOut: &channelLayoutSize
            )
            let channelLayoutData: Data
            if let ptr = channelLayoutPtr, channelLayoutSize > 0 {
                channelLayoutData = Data(bytes: ptr, count: channelLayoutSize)
            } else {
                channelLayoutData = Data()
            }
            audioSettings[AVChannelLayoutKey] = channelLayoutData

            // Failures here are only logged (not fatal); the next
            // audio sample retries input creation.
            guard writer.canApply(outputSettings: audioSettings, forMediaType: .audio) else {
                diag("canApply audioSettings failed")
                return
            }
            let input = AVAssetWriterInput(
                mediaType: .audio,
                outputSettings: audioSettings,
                sourceFormatHint: formatDescription
            )
            input.expectsMediaDataInRealTime = true
            guard writer.canAdd(input) else {
                diag("canAdd audioInput failed")
                return
            }
            writer.add(input)
            audioInput = input
            diag("audio input added"
                + " sr=\(audioSettings[AVSampleRateKey] ?? "?")"
                + " ch=\(audioSettings[AVNumberOfChannelsKey] ?? "?")")
        }

        // 2. Need the video stream to have given us a session start
        //    time before any audio can be appended.
        if recordingStartSampleTime == .invalid { return }
        let presentationTime = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
        if presentationTime < recordingStartSampleTime { return }

        // 3. Stop-time gating.
        if recordingStopSampleTime.isValid && presentationTime > recordingStopSampleTime {
            hasAllAudioBuffers = true
            maybeFinish()
            return
        }

        // 4. Append (or queue) — drain pending first, then this
        //    sample. tryAppendingAudioSampleBuffer chooses queue vs
        //    immediate-append based on its endTime vs lastVideoSampleTime.
        if !tryAppendingPendingAudioBuffers()
            || !tryAppendingAudioSampleBuffer(sampleBuffer) {
            fail(writer.error)
        }
    }

    // MARK: - audio buffer queue

    /// Append [sampleBuffer] immediately if its `endTime` doesn't
    /// run past the latest video sample; otherwise enqueue.
    ///
    /// - Returns: `false` only when the append hit a writer error.
    private func tryAppendingAudioSampleBuffer(_ sampleBuffer: CMSampleBuffer) -> Bool {
        if sampleBuffer.endTime > lastVideoSampleTime {
            pendingAudioSampleBuffers.append(sampleBuffer)
            audioQueued += 1
            return true
        }
        return internalAppendAudioSampleBuffer(sampleBuffer)
    }

    /// Drain queued audio samples that have caught up to the latest
    /// video sample. Called after every video append.
    ///
    /// - Returns: `false` when an append hit a writer error. Samples
    ///   after the failing one stay queued; the failing sample itself
    ///   is discarded.
    private func tryAppendingPendingAudioBuffers() -> Bool {
        guard !pendingAudioSampleBuffers.isEmpty else { return true }
        var stillPending: [CMSampleBuffer] = []
        stillPending.reserveCapacity(pendingAudioSampleBuffers.count)
        var ok = true
        for sample in pendingAudioSampleBuffers {
            if !ok {
                stillPending.append(sample)
                continue
            }
            if sample.endTime <= lastVideoSampleTime {
                if !internalAppendAudioSampleBuffer(sample) {
                    ok = false
                }
            } else {
                stillPending.append(sample)
            }
        }
        pendingAudioSampleBuffers = stillPending
        return ok
    }

    /// Appends one audio sample to the audio input. A sample is
    /// silently skipped (returning `true`) when the session has not
    /// started or no audio input exists.
    ///
    /// - Returns: `false` only when the writer reports an error.
    private func internalAppendAudioSampleBuffer(_ sampleBuffer: CMSampleBuffer) -> Bool {
        guard startedSession, let input = audioInput else { return true }
        guard waitUntilReady(input) else {
            // The writer left `.writing` while we waited; same policy
            // as a failed append below — an error is fatal, otherwise
            // the sample is just dropped.
            return writer?.error == nil
        }
        if input.append(sampleBuffer) {
            audioAppended += 1
            return true
        }
        if writer?.error != nil {
            return false
        }
        // Append returned false but no writer error — treat as
        // recoverable. Telegram does the same.
        return true
    }

    /// Waits in 50 ms run-loop slices until [input] accepts more data.
    /// Matches telegram-ios's busy-wait, but gives up as soon as the
    /// writer is no longer `.writing`: without that exit a failed
    /// writer would block `recorderQueue` indefinitely.
    ///
    /// - Returns: `true` when the input is ready, `false` when the
    ///   writer left the `.writing` state before that happened.
    private func waitUntilReady(_ input: AVAssetWriterInput) -> Bool {
        while !input.isReadyForMoreMediaData {
            guard writer?.status == .writing else { return false }
            RunLoop.current.run(until: Date(timeIntervalSinceNow: 0.05))
        }
        return true
    }

    // MARK: - finish

    /// Runs [finish] once video (and audio, when `hasAudio`) have
    /// both crossed the stop time. No-op when already stopped/failed.
    private func maybeFinish() {
        guard hasAllVideoBuffers,
              (!hasAudio || hasAllAudioBuffers),
              !stopped,
              !failed
        else { return }
        stopped = true
        finish()
    }

    /// Flushes drainable audio, then asks the writer to finish and
    /// delivers the outcome on `recorderQueue`.
    private func finish() {
        // Drain any audio buffer still pending up to the stop time.
        _ = tryAppendingPendingAudioBuffers()
        // Whatever is still queued will never be written; release the
        // buffers now instead of holding them for the recorder's lifetime.
        let droppedAudioCount = pendingAudioSampleBuffers.count
        pendingAudioSampleBuffers.removeAll()

        guard let writer = writer else {
            deliver(.failure(NSError(
                domain: "ux.camera",
                code: -40,
                userInfo: [NSLocalizedDescriptionKey: "writer missing on finish"]
            )))
            return
        }
        // Only `finishWriting` when the writer reached `.writing`.
        guard writer.status == .writing else {
            diag("finish skipped — writer.status=\(writer.status.rawValue)")
            failOnError(writer.error)
            return
        }
        let url = self.url
        diag("finishWriting:"
            + " vRecv=\(videoReceived) vApp=\(videoAppended)"
            + " aRecv=\(audioReceived) aApp=\(audioAppended)"
            + " aQueuedDrop=\(droppedAudioCount)")
        writer.finishWriting { [weak self] in
            self?.recorderQueue.async {
                guard let self = self else { return }
                if writer.status == .completed {
                    self.deliver(.success(url))
                } else {
                    self.failOnError(writer.error)
                }
            }
        }
    }

    /// Marks the recorder failed (once), releases queued audio and
    /// reports [error] through [failOnError].
    private func fail(_ error: Error?) {
        if failed { return }
        failed = true
        pendingAudioSampleBuffers.removeAll()
        failOnError(error)
    }

    /// Deletes the partial file (best effort) and delivers [error],
    /// or a generic `-41` error when [error] is `nil`.
    private func failOnError(_ error: Error?) {
        try? FileManager.default.removeItem(at: url)
        let ns = (error as NSError?) ?? NSError(
            domain: "ux.camera",
            code: -41,
            userInfo: [NSLocalizedDescriptionKey: "AVAssetWriter failed"]
        )
        deliver(.failure(ns))
    }

    /// Records the first terminal outcome and fires the pending
    /// [stop] completion, if any, exactly once.
    private func deliver(_ outcome: Result<URL, Error>) {
        if finalOutcome == nil {
            finalOutcome = outcome
        }
        let cb = completion
        completion = nil
        cb?(outcome)
    }
}

// MARK: - CMSampleBuffer ergonomics

private extension CMSampleBuffer {
    /// `presentationTime + duration` — last instant covered by this
    /// buffer. Used to pace audio against video. Falls back to the
    /// presentation time alone when the duration is not valid.
    var endTime: CMTime {
        let pts = CMSampleBufferGetPresentationTimeStamp(self)
        let dur = CMSampleBufferGetDuration(self)
        if dur.flags.contains(.valid) {
            return pts + dur
        }
        return pts
    }
}