// ux/android/ffmpeg/ffmpeg_jni.cc

/*
 * Copyright 2026 swipelab.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * JNI bridge for the ux FFmpeg video decoder. Exposes a small surface
 * (init / sendPacket / receiveFrame / flush / release) that
 * FfmpegVideoDecoder.java drives. The audio path was dropped — Media3's
 * MediaCodec AAC decoder handles audio on every device we ship to.
 */
#include <android/log.h>
#include <jni.h>
#include <stdlib.h>
#include <string.h>

extern "C" {
#ifdef __cplusplus
#define __STDC_CONSTANT_MACROS
#ifdef _STDINT_H
#undef _STDINT_H
#endif
#include <stdint.h>
#endif
#include <libavcodec/avcodec.h>
#include <libavutil/error.h>
#include <libavutil/imgutils.h>
#include <libavutil/opt.h>
#include <libavutil/pixfmt.h>
}

// Logcat tag shared by every message emitted from this file.
#define LOG_TAG "ux_ffmpeg_jni"

// Single funnel into __android_log_print; the level-specific macros below
// only pick the priority. The (void) cast discards the print's return value.
#define UX_LOG(PRIORITY, ...) \
  ((void)__android_log_print((PRIORITY), LOG_TAG, __VA_ARGS__))

#define LOGE(...) UX_LOG(ANDROID_LOG_ERROR, __VA_ARGS__)
#define LOGI(...) UX_LOG(ANDROID_LOG_INFO, __VA_ARGS__)
#define LOGD(...) UX_LOG(ANDROID_LOG_DEBUG, __VA_ARGS__)

// Declares and opens the definition of a JNI entry point named
// Java_io_swipelab_ux_video_ffmpeg_FfmpegLibrary_<NAME>.
//
// The expansion first emits a prototype inside an extern "C" block (so the
// symbol is exported unmangled), then repeats the signature without a
// trailing semicolon so the function body can follow the macro invocation.
// Every function implicitly receives `JNIEnv* env` and `jobject thiz`;
// additional parameters are passed through the variadic part. The
// `##__VA_ARGS__` form drops the preceding comma when no extra parameters are
// given (a GNU preprocessor extension).
#define LIBRARY_FUNC(RETURN_TYPE, NAME, ...)                                  \
  extern "C" {                                                                \
  JNIEXPORT RETURN_TYPE                                                       \
  Java_io_swipelab_ux_video_ffmpeg_FfmpegLibrary_##NAME(JNIEnv* env,          \
                                                        jobject thiz,         \
                                                        ##__VA_ARGS__);       \
  }                                                                           \
  JNIEXPORT RETURN_TYPE                                                       \
  Java_io_swipelab_ux_video_ffmpeg_FfmpegLibrary_##NAME(                      \
      JNIEnv* env, jobject thiz, ##__VA_ARGS__)

// Declares and opens the definition of a JNI entry point named
// Java_io_swipelab_ux_video_ffmpeg_FfmpegVideoDecoder_<NAME>.
//
// The expansion emits an extern "C" prototype followed by the same signature
// without a semicolon, so the function body is written directly after the
// macro invocation. `JNIEnv* env` and `jobject thiz` are always the first two
// parameters; any further parameters come from the variadic part, and
// `##__VA_ARGS__` drops the preceding comma when there are none (a GNU
// preprocessor extension).
#define VIDEO_DECODER_FUNC(RETURN_TYPE, NAME, ...)                            \
  extern "C" {                                                                \
  JNIEXPORT RETURN_TYPE                                                       \
  Java_io_swipelab_ux_video_ffmpeg_FfmpegVideoDecoder_##NAME(                 \
      JNIEnv* env, jobject thiz, ##__VA_ARGS__);                              \
  }                                                                           \
  JNIEXPORT RETURN_TYPE                                                       \
  Java_io_swipelab_ux_video_ffmpeg_FfmpegVideoDecoder_##NAME(                 \
      JNIEnv* env, jobject thiz, ##__VA_ARGS__)

// Size in bytes of the stack buffer that logError passes to av_strerror.
#define ERROR_STRING_BUFFER_LENGTH 256

// Status codes returned to Java by the send/receive entry points.
// Mirrored in FfmpegVideoDecoder.java.
// A packet was accepted or a frame was written.
static const int VIDEO_DECODER_SUCCESS = 0;
// libavcodec reported AVERROR_INVALIDDATA (see transformError).
static const int VIDEO_DECODER_ERROR_INVALID_DATA = -1;
// Any other failure, including bad arguments from the Java side.
static const int VIDEO_DECODER_ERROR_OTHER = -2;
// No frame is available yet (receive), or the packet could not be queued
// (send).
static const int VIDEO_DECODER_READ_AGAIN = -3;

// Colorspace identifiers passed as the last argument of initForYuvFrame.
// VideoDecoderOutputBuffer.COLORSPACE_* mirror.
static const int COLORSPACE_UNKNOWN = 0;
static const int COLORSPACE_BT601 = 1;
static const int COLORSPACE_BT709 = 2;
static const int COLORSPACE_BT2020 = 3;

// JNI method/field IDs of androidx.media3.decoder.VideoDecoderOutputBuffer.
// All four are resolved once in JNI_OnLoad and then reused by
// ffmpegVideoReceiveFrame.

// VideoDecoderOutputBuffer.initForYuvFrame(int, int, int, int, int) -> boolean.
static jmethodID initForYuvFrameMethod;
// VideoDecoderOutputBuffer.data (java.nio.ByteBuffer) that receives the
// copied Y, U and V planes.
static jfieldID dataField;
// Carries full-range info (1) vs limited-range info (0) per frame to
// Java so the GL shader picks the matching BT.709 conversion matrix.
static jfieldID decoderPrivateField;
// Reassigned per output frame to the decoded frame's actual PTS
// (NOT the input packet's PTS — for H.264 reorder, output display
// order differs from input decode order, and using the input PTS
// scrambles ExoPlayer's frame-late detection so it drops half the
// stream).
static jfieldID timeUsField;

// Translates libavutil's AVColorSpace into the COLORSPACE_* value handed to
// the Java output buffer. Anything that is not a BT.709, BT.601-family or
// BT.2020 matrix maps to COLORSPACE_UNKNOWN so the caller can apply its own
// fallback.
static int colorspaceFromAVColorSpace(AVColorSpace cs) {
  if (cs == AVCOL_SPC_BT709) {
    return COLORSPACE_BT709;
  }
  // BT.470BG and SMPTE 170M are both treated as BT.601.
  if (cs == AVCOL_SPC_BT470BG || cs == AVCOL_SPC_SMPTE170M) {
    return COLORSPACE_BT601;
  }
  // Both BT.2020 variants (non-constant and constant luminance).
  if (cs == AVCOL_SPC_BT2020_NCL || cs == AVCOL_SPC_BT2020_CL) {
    return COLORSPACE_BT2020;
  }
  return COLORSPACE_UNKNOWN;
}

static void logError(const char* fn, int err) {
  char buf[ERROR_STRING_BUFFER_LENGTH] = {0};
  av_strerror(err, buf, ERROR_STRING_BUFFER_LENGTH);
  LOGE("Error in %s: %s", fn, buf);
}

// Maps a libavcodec error code onto the status codes shared with Java:
// AVERROR_INVALIDDATA becomes VIDEO_DECODER_ERROR_INVALID_DATA, every other
// value becomes VIDEO_DECODER_ERROR_OTHER.
static int transformError(int err) {
  if (err == AVERROR_INVALIDDATA) {
    return VIDEO_DECODER_ERROR_INVALID_DATA;
  }
  return VIDEO_DECODER_ERROR_OTHER;
}

// Per-decoder native state. ffmpegVideoInitialize heap-allocates one of
// these and returns its address to Java as the opaque `long` handle; every
// other entry point casts the handle back to this type.
//
// A bare AVCodecContext is not sufficient because the bridge also keeps:
//   * a reusable AVFrame, so decoding does not allocate a frame per call;
//   * a one-slot "pending" frame that holds output pulled while recovering
//     from a send-side EAGAIN, so the next receiveFrame call can emit it in
//     order instead of losing it.
struct UxFfmpegVideoContext {
  // Opened libavcodec decoder.
  AVCodecContext* codec_ctx{nullptr};
  // Scratch frame used by ffmpegVideoReceiveFrame.
  AVFrame* frame{nullptr};
  // Storage for the frame drained during a send-side EAGAIN.
  AVFrame* pending_frame{nullptr};
  // True while pending_frame holds a frame that has not been emitted yet.
  bool has_pending{false};
};

// Frees everything owned by `ctx` (both frames and the codec context) and
// then the context object itself. Accepts nullptr and partially-initialised
// contexts, which is what the failure paths of ffmpegVideoInitialize rely on.
static void releaseContext(UxFfmpegVideoContext* ctx) {
  if (ctx == nullptr) {
    return;
  }
  // Same order as allocation is irrelevant here; frames first, codec last.
  AVFrame** frameSlots[] = {&ctx->frame, &ctx->pending_frame};
  for (AVFrame** slot : frameSlots) {
    if (*slot != nullptr) {
      av_frame_free(slot);
    }
  }
  if (ctx->codec_ctx != nullptr) {
    avcodec_free_context(&ctx->codec_ctx);
  }
  delete ctx;
}

// Library load hook. Resolves and caches the VideoDecoderOutputBuffer
// method/field IDs that ffmpegVideoReceiveFrame needs, so the per-frame path
// never performs reflection-style lookups.
//
// Returns JNI_VERSION_1_6 on success, or -1 if the JNIEnv cannot be obtained
// or any lookup fails (each failure is logged).
jint JNI_OnLoad(JavaVM* vm, void* reserved) {
  JNIEnv* env;
  const jint envStatus =
      vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
  if (envStatus != JNI_OK) {
    LOGE("JNI_OnLoad: GetEnv failed");
    return -1;
  }

  jclass outputBufferClass =
      env->FindClass("androidx/media3/decoder/VideoDecoderOutputBuffer");
  if (outputBufferClass == nullptr) {
    LOGE("JNI_OnLoad: FindClass(VideoDecoderOutputBuffer) failed");
    return -1;
  }

  initForYuvFrameMethod =
      env->GetMethodID(outputBufferClass, "initForYuvFrame", "(IIIII)Z");
  if (initForYuvFrameMethod == nullptr) {
    LOGE("JNI_OnLoad: GetMethodID(initForYuvFrame) failed");
    return -1;
  }

  dataField =
      env->GetFieldID(outputBufferClass, "data", "Ljava/nio/ByteBuffer;");
  if (dataField == nullptr) {
    LOGE("JNI_OnLoad: GetFieldID(data) failed");
    return -1;
  }

  decoderPrivateField =
      env->GetFieldID(outputBufferClass, "decoderPrivate", "J");
  if (decoderPrivateField == nullptr) {
    LOGE("JNI_OnLoad: GetFieldID(decoderPrivate) failed");
    return -1;
  }

  // timeUs is declared on the DecoderOutputBuffer base class, but the lookup
  // through the concrete subclass still finds it.
  timeUsField = env->GetFieldID(outputBufferClass, "timeUs", "J");
  if (timeUsField == nullptr) {
    LOGE("JNI_OnLoad: GetFieldID(timeUs) failed");
    return -1;
  }

  return JNI_VERSION_1_6;
}

// Returns the LIBAVCODEC_IDENT string of the libavcodec headers this library
// was compiled against, as a new Java String.
LIBRARY_FUNC(jstring, ffmpegGetVersion) {
  const char* ident = LIBAVCODEC_IDENT;
  return env->NewStringUTF(ident);
}

// Returns AV_INPUT_BUFFER_PADDING_SIZE, the number of extra bytes libavcodec
// requires after the payload of every input buffer.
LIBRARY_FUNC(jint, ffmpegGetInputBufferPaddingSize) {
  const jint padding = static_cast<jint>(AV_INPUT_BUFFER_PADDING_SIZE);
  return padding;
}

// Reports whether the linked libavcodec build contains a decoder with the
// given name.
//
// codecName: libavcodec decoder name; null yields JNI_FALSE.
// Returns JNI_TRUE when avcodec_find_decoder_by_name finds a decoder,
// JNI_FALSE otherwise (including when the name cannot be read).
LIBRARY_FUNC(jboolean, ffmpegHasDecoder, jstring codecName) {
  if (!codecName) return JNI_FALSE;
  const char* name = env->GetStringUTFChars(codecName, nullptr);
  if (!name) {
    // GetStringUTFChars failed and left an exception pending. Passing the
    // null pointer on to ReleaseStringUTFChars would be invalid, so bail out
    // and let the exception surface in Java.
    LOGE("ffmpegHasDecoder: GetStringUTFChars failed");
    return JNI_FALSE;
  }
  const AVCodec* codec = avcodec_find_decoder_by_name(name);
  env->ReleaseStringUTFChars(codecName, name);
  return codec != nullptr ? JNI_TRUE : JNI_FALSE;
}

// Creates and opens a libavcodec video decoder.
//
// codecName: libavcodec decoder name (must not be null).
// extraData: optional codec configuration bytes copied into the codec
//            context's extradata; may be null.
// threads:   requested decoder thread count; values <= 0 are passed to
//            libavcodec as 0.
//
// Returns an opaque handle (a UxFfmpegVideoContext pointer) to pass to the
// other ffmpegVideo* functions and eventually to ffmpegVideoRelease, or 0 on
// any failure (the reason is logged).
VIDEO_DECODER_FUNC(jlong, ffmpegVideoInitialize, jstring codecName,
                   jbyteArray extraData, jint threads) {
  if (!codecName) {
    LOGE("ffmpegVideoInitialize: codecName is null");
    return 0L;
  }
  const char* name = env->GetStringUTFChars(codecName, nullptr);
  if (!name) {
    // GetStringUTFChars failed and left an exception pending; the null
    // pointer must not be handed to ReleaseStringUTFChars.
    LOGE("ffmpegVideoInitialize: GetStringUTFChars failed");
    return 0L;
  }
  const AVCodec* codec = avcodec_find_decoder_by_name(name);
  env->ReleaseStringUTFChars(codecName, name);
  if (!codec) {
    LOGE("ffmpegVideoInitialize: codec not found");
    return 0L;
  }

  UxFfmpegVideoContext* ctx = new UxFfmpegVideoContext();
  ctx->codec_ctx = avcodec_alloc_context3(codec);
  if (!ctx->codec_ctx) {
    LOGE("ffmpegVideoInitialize: avcodec_alloc_context3 failed");
    releaseContext(ctx);
    return 0L;
  }

  if (extraData) {
    jsize size = env->GetArrayLength(extraData);
    // libavcodec requires extradata to be av_malloc'ed and followed by
    // AV_INPUT_BUFFER_PADDING_SIZE zeroed bytes; av_mallocz provides both.
    // Ownership passes to codec_ctx, which frees it in avcodec_free_context.
    ctx->codec_ctx->extradata =
        (uint8_t*)av_mallocz(size + AV_INPUT_BUFFER_PADDING_SIZE);
    if (!ctx->codec_ctx->extradata) {
      LOGE("ffmpegVideoInitialize: extradata alloc failed");
      releaseContext(ctx);
      return 0L;
    }
    env->GetByteArrayRegion(extraData, 0, size,
                            (jbyte*)ctx->codec_ctx->extradata);
    ctx->codec_ctx->extradata_size = size;
  }

  ctx->codec_ctx->thread_count = threads > 0 ? threads : 0;
  // FF_THREAD_SLICE only. FRAME threading buffers thread_count
  // input frames before producing output, pushing decoded frames
  // past their PTS deadline and causing ExoPlayer to drop them.
  // Most iOS-captured H.264 emits one slice per frame, so slice
  // threading degenerates to single-threaded; libavcodec's H.264
  // decoder does not auto-promote SLICE-only to FRAME, so we
  // accept modest throughput in exchange for low latency. 480p
  // decode is ~2 ms per frame single-threaded on any modern ARM
  // core anyway.
  ctx->codec_ctx->thread_type = FF_THREAD_SLICE;
  ctx->codec_ctx->err_recognition = AV_EF_IGNORE_ERR;
  // PTS values are passed in microseconds (Media3's native unit),
  // and libavcodec propagates packet.pts → frame.pts through the
  // reorder buffer so we can recover display-order timestamps on
  // receive.
  ctx->codec_ctx->time_base = AVRational{1, 1000000};
  ctx->codec_ctx->pkt_timebase = AVRational{1, 1000000};

  int result = avcodec_open2(ctx->codec_ctx, codec, nullptr);
  if (result < 0) {
    logError("avcodec_open2", result);
    releaseContext(ctx);
    return 0L;
  }

  ctx->frame = av_frame_alloc();
  ctx->pending_frame = av_frame_alloc();
  if (!ctx->frame || !ctx->pending_frame) {
    LOGE("ffmpegVideoInitialize: av_frame_alloc failed");
    releaseContext(ctx);
    return 0L;
  }
  ctx->has_pending = false;
  return (jlong)ctx;
}

// Queues one compressed packet into the decoder.
//
// handle:    value returned by ffmpegVideoInitialize.
// inputData: direct ByteBuffer holding the packet; the packet is read from
//            the buffer's base address.
// inputSize: number of payload bytes; must be > 0 and no larger than the
//            buffer's capacity.
// ptsUs:     presentation timestamp in microseconds, or Long.MIN_VALUE when
//            unset.
//
// Returns VIDEO_DECODER_SUCCESS when the packet was accepted,
// VIDEO_DECODER_READ_AGAIN when the decoder's input queue stayed full and the
// packet was dropped, or a VIDEO_DECODER_ERROR_* code on failure.
VIDEO_DECODER_FUNC(jint, ffmpegVideoSendPacket, jlong handle, jobject inputData,
                   jint inputSize, jlong ptsUs) {
  if (!handle) {
    LOGE("ffmpegVideoSendPacket: null handle");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  if (!inputData || inputSize <= 0) {
    LOGE("ffmpegVideoSendPacket: bad input");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  UxFfmpegVideoContext* ctx = (UxFfmpegVideoContext*)handle;
  uint8_t* buf = (uint8_t*)env->GetDirectBufferAddress(inputData);
  if (!buf) {
    LOGE("ffmpegVideoSendPacket: GetDirectBufferAddress null");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  // Never let libavcodec read past the end of the Java buffer: a size larger
  // than the buffer's capacity would be an out-of-bounds native read.
  // GetDirectBufferCapacity returns -1 when the capacity is unavailable; in
  // that case the check is skipped rather than rejecting the packet.
  const jlong capacity = env->GetDirectBufferCapacity(inputData);
  if (capacity >= 0 && (jlong)inputSize > capacity) {
    LOGE("ffmpegVideoSendPacket: inputSize %d exceeds buffer capacity %lld",
         (int)inputSize, (long long)capacity);
    return VIDEO_DECODER_ERROR_OTHER;
  }
  AVPacket* pkt = av_packet_alloc();
  if (!pkt) {
    LOGE("ffmpegVideoSendPacket: av_packet_alloc failed");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  pkt->data = buf;
  pkt->size = inputSize;
  // Media3's C.TIME_UNSET is Long.MIN_VALUE which by happy coincidence
  // equals libavcodec's AV_NOPTS_VALUE; gate it explicitly so a future
  // Media3 sentinel change doesn't silently scramble PTS recovery.
  pkt->pts = (ptsUs == INT64_MIN) ? AV_NOPTS_VALUE : (int64_t)ptsUs;
  pkt->dts = AV_NOPTS_VALUE;

  // Per libavcodec contract, EAGAIN on send means the packet was NOT
  // consumed and the caller must drain output before re-sending. We
  // can't return EAGAIN to SimpleDecoder (its 1-in / 1-out model
  // would consume the input buffer and lose the packet), so when the
  // queue is full we drain one frame into pending_frame and retry.
  // pending_frame is then emitted by the next ffmpegVideoReceiveFrame
  // call before pulling a new one from libavcodec.
  bool drainFailed = false;
  int result = avcodec_send_packet(ctx->codec_ctx, pkt);
  if (result == AVERROR(EAGAIN) && !ctx->has_pending) {
    int recv = avcodec_receive_frame(ctx->codec_ctx, ctx->pending_frame);
    if (recv == 0) {
      ctx->has_pending = true;
      result = avcodec_send_packet(ctx->codec_ctx, pkt);
    } else {
      drainFailed = true;
      logError("send-EAGAIN drain receive", recv);
    }
  }
  av_packet_free(&pkt);
  if (result == AVERROR(EAGAIN)) {
    // The packet is dropped rather than blocking. Distinguish the two ways
    // of getting here so the log does not claim the pending slot is occupied
    // when the drain itself failed.
    if (drainFailed) {
      LOGE("ffmpegVideoSendPacket: queue full and drain failed");
    } else {
      // Should never happen at steady state given numOutputBuffers=16.
      LOGE("ffmpegVideoSendPacket: queue full and pending slot occupied");
    }
    return VIDEO_DECODER_READ_AGAIN;
  }
  if (result < 0) {
    logError("avcodec_send_packet", result);
    return transformError(result);
  }
  return VIDEO_DECODER_SUCCESS;
}

// Copies `rows` rows of one image plane from `src` to `dst`. When both
// strides match the plane is copied with a single memcpy (padding included);
// otherwise it is copied row by row, transferring the smaller of the two
// strides per row so neither buffer is overrun. Both strides must be > 0.
static void copyPlane(uint8_t* dst, int dstStride, const uint8_t* src,
                      int srcStride, int rows) {
  if (srcStride == dstStride) {
    memcpy(dst, src, (size_t)dstStride * (size_t)rows);
    return;
  }
  const size_t rowBytes =
      (size_t)(srcStride < dstStride ? srcStride : dstStride);
  for (int row = 0; row < rows; ++row) {
    memcpy(dst + (size_t)row * (size_t)dstStride,
           src + (size_t)row * (size_t)srcStride, rowBytes);
  }
}

// Pulls the next decoded frame and writes it into the Java
// VideoDecoderOutputBuffer's YUV planes.
//
// handle:       value returned by ffmpegVideoInitialize.
// outputBuffer: VideoDecoderOutputBuffer to fill; its data buffer is
//               (re)initialised through initForYuvFrame, and its timeUs and
//               decoderPrivate fields are overwritten.
//
// Returns:
//   VIDEO_DECODER_SUCCESS       -> frame written
//   VIDEO_DECODER_READ_AGAIN    -> no frame yet, send more packets
//   VIDEO_DECODER_ERROR_*       -> fatal
VIDEO_DECODER_FUNC(jint, ffmpegVideoReceiveFrame, jlong handle,
                   jobject outputBuffer) {
  if (!handle) {
    LOGE("ffmpegVideoReceiveFrame: null handle");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  // Checked before any frame is consumed so a bad call does not lose output.
  if (!outputBuffer) {
    LOGE("ffmpegVideoReceiveFrame: null outputBuffer");
    return VIDEO_DECODER_ERROR_OTHER;
  }
  UxFfmpegVideoContext* ctx = (UxFfmpegVideoContext*)handle;
  AVFrame* f = ctx->frame;
  // If a frame was drained into pending_frame to recover from a
  // send-side EAGAIN, emit it before pulling the next one — keeps
  // display-order continuity even when libavcodec backpressures the
  // input queue.
  if (ctx->has_pending) {
    av_frame_unref(f);
    av_frame_move_ref(f, ctx->pending_frame);
    ctx->has_pending = false;
  } else {
    int result = avcodec_receive_frame(ctx->codec_ctx, f);
    if (result == AVERROR(EAGAIN) || result == AVERROR_EOF) {
      return VIDEO_DECODER_READ_AGAIN;
    }
    if (result < 0) {
      logError("avcodec_receive_frame", result);
      return transformError(result);
    }
  }
  // Only planar 4:2:0 YUV is supported by VideoDecoderOutputBuffer's
  // 3-plane layout. iOS H.264 produces YUV420P (limited range) or
  // YUVJ420P (full range); identical memory layout, only range
  // interpretation differs.
  AVPixelFormat pix = (AVPixelFormat)f->format;
  if (pix != AV_PIX_FMT_YUV420P && pix != AV_PIX_FMT_YUVJ420P) {
    LOGE("ffmpegVideoReceiveFrame: unsupported pix_fmt=%d", (int)pix);
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }

  int width = f->width;
  int height = f->height;
  int yStride = f->linesize[0];
  int uvStride = f->linesize[1];
  int vStride = f->linesize[2];
  // The copies below turn stride * rows into a size_t length; a zero or
  // negative value would become an empty or enormous memcpy, so reject it.
  if (width <= 0 || height <= 0 || yStride <= 0 || uvStride <= 0 ||
      vStride <= 0) {
    LOGE("ffmpegVideoReceiveFrame: bad geometry %dx%d strides=%d/%d/%d", width,
         height, yStride, uvStride, vStride);
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }
  int colorspace = colorspaceFromAVColorSpace(f->colorspace);
  if (colorspace == COLORSPACE_UNKNOWN) {
    // iOS H.264 commonly leaves VUI fields unspecified; default to
    // BT.709 for HD-shaped frames, BT.601 below SD width threshold.
    colorspace = (width >= 1280 || height >= 720) ? COLORSPACE_BT709
                                                  : COLORSPACE_BT601;
  }

  jboolean ok = env->CallBooleanMethod(outputBuffer, initForYuvFrameMethod,
                                       width, height, yStride, uvStride,
                                       colorspace);
  if (env->ExceptionCheck()) {
    LOGE("initForYuvFrame threw");
    env->ExceptionDescribe();
    env->ExceptionClear();
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }
  if (!ok) {
    LOGE("initForYuvFrame returned false (overflow?)");
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }

  // Display-order PTS recovered from libavcodec. The Java side
  // initialised the buffer with the input packet's PTS; for B-frame
  // streams that is the WRONG value because the frame we're about to
  // emit was decoded out of input order. Overwriting with f->pts puts
  // each output buffer back on the timeline ExoPlayer expects.
  if (f->pts != AV_NOPTS_VALUE) {
    env->SetLongField(outputBuffer, timeUsField, (jlong)f->pts);
  }

  // Copy each plane into the ByteBuffer that initForYuvFrame allocated.
  jobject dataBuf = env->GetObjectField(outputBuffer, dataField);
  if (!dataBuf) {
    LOGE("ffmpegVideoReceiveFrame: data ByteBuffer is null after init");
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }
  uint8_t* dst = (uint8_t*)env->GetDirectBufferAddress(dataBuf);
  if (!dst) {
    LOGE("ffmpegVideoReceiveFrame: GetDirectBufferAddress null");
    env->DeleteLocalRef(dataBuf);
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }

  int uvHeight = (height + 1) / 2;
  int yLength = yStride * height;
  int uvLength = uvStride * uvHeight;

  // Verify the destination really is large enough before writing into it
  // with raw memcpy. GetDirectBufferCapacity returns -1 when the capacity is
  // unavailable; in that case the check is skipped.
  const int64_t required = (int64_t)yLength + 2 * (int64_t)uvLength;
  const jlong capacity = env->GetDirectBufferCapacity(dataBuf);
  if (capacity >= 0 && (int64_t)capacity < required) {
    LOGE("ffmpegVideoReceiveFrame: data buffer too small (%lld < %lld)",
         (long long)capacity, (long long)required);
    env->DeleteLocalRef(dataBuf);
    av_frame_unref(f);
    return VIDEO_DECODER_ERROR_OTHER;
  }

  // Destination layout is Y, then U, then V, with Y rows yStride apart and
  // both chroma planes uvStride apart (the values given to initForYuvFrame).
  // Plane 0: Y
  copyPlane(dst, yStride, f->data[0], yStride, height);
  // Plane 1: U
  copyPlane(dst + yLength, uvStride, f->data[1], uvStride, uvHeight);
  // Plane 2: V — read with its own linesize, which is not guaranteed to
  // equal the U plane's.
  copyPlane(dst + yLength + uvLength, uvStride, f->data[2], vStride, uvHeight);

  // iOS yuvj420p / AVCOL_RANGE_JPEG => full-range YUV; the renderer's
  // shader needs to skip the limited-range pre-scale on Y.
  jlong rangeFlag = (pix == AV_PIX_FMT_YUVJ420P ||
                     f->color_range == AVCOL_RANGE_JPEG)
                        ? 1L
                        : 0L;
  env->SetLongField(outputBuffer, decoderPrivateField, rangeFlag);

  env->DeleteLocalRef(dataBuf);
  av_frame_unref(f);
  return VIDEO_DECODER_SUCCESS;
}

VIDEO_DECODER_FUNC(void, ffmpegVideoFlush, jlong handle) {
  if (!handle) return;
  UxFfmpegVideoContext* ctx = (UxFfmpegVideoContext*)handle;
  if (ctx->has_pending) {
    av_frame_unref(ctx->pending_frame);
    ctx->has_pending = false;
  }
  avcodec_flush_buffers(ctx->codec_ctx);
}

VIDEO_DECODER_FUNC(void, ffmpegVideoRelease, jlong handle) {
  if (!handle) return;
  releaseContext((UxFfmpegVideoContext*)handle);
}