apv_decode: add a Vulkan hwaccel

2026-06-24 08:48:37 +00:00 · 2025-11-02 14:38:33 +00:00
parent 704df177aa
commit 5ad8c67e6c
11 changed files with 845 additions and 1 deletions
--- a/1
+++ b/1
@@ -11,6 +11,7 @@ version <next>:
 - Add AMF Frame Rate Converter (vf_frc_amf) filter
 - SMPTE 2094-50 metadata support and passthrough
 - ProRes RAW VideoToolbox hwaccel
+- APV Vulkan hwaccel


 version 8.1:
--- a/2
+++ b/2
@@ -3403,6 +3403,8 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
 vulkan_deps="threads"
 vulkan_deps_any="libdl LoadLibrary"

+apv_vulkan_hwaccel_deps="vulkan spirv_compiler"
+apv_vulkan_hwaccel_select="apv_decoder"
 av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
 av1_d3d11va_hwaccel_select="av1_decoder"
 av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -1047,6 +1047,7 @@ OBJS-$(CONFIG_VIDEOTOOLBOX)               += videotoolbox.o
 OBJS-$(CONFIG_VDPAU)                      += vdpau.o
 OBJS-$(CONFIG_VULKAN)                     += vulkan.o vulkan_video.o

+OBJS-$(CONFIG_APV_VULKAN_HWACCEL)         += vulkan_decode.o vulkan_apv.o
 OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL)        += dxva2_av1.o
 OBJS-$(CONFIG_AV1_DXVA2_HWACCEL)          += dxva2_av1.o
 OBJS-$(CONFIG_AV1_D3D12VA_HWACCEL)        += dxva2_av1.o d3d12va_av1.o
--- a/libavcodec/apv_decode.c
+++ b/libavcodec/apv_decode.c
@@ -51,6 +51,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx,
                                           enum AVPixelFormat pix_fmt)
 {
    enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_APV_VULKAN_HWACCEL
+        AV_PIX_FMT_VULKAN,
+#endif
        pix_fmt,
        AV_PIX_FMT_NONE,
    };
@@ -603,6 +606,9 @@ const FFCodec ff_apv_decoder = {
                             AV_CODEC_CAP_FRAME_THREADS,
    .caps_internal         = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
    .hw_configs     = (const AVCodecHWConfigInternal *const []) {
+#if CONFIG_APV_VULKAN_HWACCEL
+        HWACCEL_VULKAN(apv),
+#endif
        NULL
    },
 };
--- a/libavcodec/hwaccels.h
+++ b/libavcodec/hwaccels.h
@@ -19,6 +19,7 @@
 #ifndef AVCODEC_HWACCELS_H
 #define AVCODEC_HWACCELS_H

+extern const struct FFHWAccel ff_apv_vulkan_hwaccel;
 extern const struct FFHWAccel ff_av1_d3d11va_hwaccel;
 extern const struct FFHWAccel ff_av1_d3d11va2_hwaccel;
 extern const struct FFHWAccel ff_av1_d3d12va_hwaccel;
--- a/libavcodec/version.h
+++ b/libavcodec/version.h
@@ -29,7 +29,7 @@

 #include "version_major.h"

-#define LIBAVCODEC_VERSION_MINOR  33
+#define LIBAVCODEC_VERSION_MINOR  34
 #define LIBAVCODEC_VERSION_MICRO 100

 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
--- a/libavcodec/vulkan/Makefile
+++ b/libavcodec/vulkan/Makefile
@@ -1,6 +1,9 @@
 clean::
 	$(RM) $(CLEANSUFFIXES:%=libavcodec/vulkan/%)

+OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan/apv_decode.comp.spv.o \
+                                     vulkan/apv_idct.comp.spv.o
+
 OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \
                                      vulkan/ffv1_enc_reset.comp.spv.o \
                                      vulkan/ffv1_enc_reset_golomb.comp.spv.o \
--- a/libavcodec/vulkan/apv_decode.comp.glsl
+++ b/libavcodec/vulkan/apv_decode.comp.glsl
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+#extension GL_GOOGLE_include_directive : require
+
+#include "common.glsl"
+
+/* Upper limits used to size the fixed arrays of the frame data buffer below */
+#define APV_MAX_NUM_COMP    4
+#define APV_MAX_TILE_COLS   20
+#define APV_MAX_TILE_ROWS   20
+#define APV_MAX_TILE_COUNT  (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS)
+/* Inclusive range accepted for a decoded coefficient (signed 16-bit) */
+#define APV_MIN_TRANS_COEFF -32768
+#define APV_MAX_TRANS_COEFF 32767
+/* A transform block is 8x8 coefficients; a macroblock is 16x16 luma samples */
+#define APV_TR_SIZE         8
+#define APV_BLK_COEFFS      (APV_TR_SIZE * APV_TR_SIZE)
+#define APV_MB_SIZE         (ivec2(16, 16))
+
+/* One storage image per plane; this shader only writes coefficients into it */
+layout (set = 0, binding = 0) uniform writeonly uimage2D dst[];
+/* Per-frame side data written by the host. Scalar layout, so members are
+ * tightly packed in declaration order. */
+layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
+    /* .x = byte offset of the tile payload within tile_data, .y = its size
+     * as handed to init_get_bits() */
+    uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
+    /* Quantization matrices and tile QPs; not read by this shader */
+    uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
+    uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
+    /* Tile column/row boundaries in luma samples; entry [i + 1] ends tile i */
+    uint16_t tile_col[APV_MAX_TILE_COLS + 1];
+    uint16_t tile_row[APV_MAX_TILE_ROWS + 1];
+};
+
+/* Must stay in sync with DecodePushData on the host side */
+layout (push_constant, scalar) uniform pushConstants {
+    u8buf tile_data;       /* device address of the tile bitstream data */
+    ivec2 tile_count;      /* number of tile columns (x) and rows (y) */
+    ivec2 log2_chroma_sub; /* chroma subsampling shifts, applied to planes > 0 */
+    int components;        /* not read by this shader */
+    int bit_depth;         /* not read by this shader */
+};
+
+/* Bitstream reader of the tile/component handled by this invocation */
+GetBitContext gb;
+
+/*
+ * Read one APV variable-length code with Rice-like parameter k from gb.
+ *
+ * Returns the decoded non-negative value. Malformed codes return 1 << 16
+ * instead: that is larger than any value a caller can accept (a DC delta of
+ * 65536 always leaves the 16-bit coefficient range, and run/level checks
+ * reject it too), so the block is abandoned rather than decoded from garbage.
+ */
+int apv_read_vlc(int k)
+{
+    /* Rejected by the range checks of every caller */
+    const int invalid_code = 1 << 16;
+
+    /* Top 32 bits, longest valid APV code is 1 + 2*5 + 5 = 16 bits */
+    uint bits = show_bits(gb, 32);
+    uint mask = (1u << k) - 1u;
+
+    /* 1xxx: short, length 1+k, value = next k bits */
+    if (bits >= 0x80000000u) {
+        skip_bits(gb, 1 + k);
+        return int((bits >> (31 - k)) & mask);
+    }
+
+    /* 00xxx: short, length 2+k, value = (1<<k) + next k bits */
+    if (bits < 0x40000000u) {
+        skip_bits(gb, 2 + k);
+        return int((bits >> (30 - k)) & mask) + (1 << k);
+    }
+
+    /* 01 prefix + (n leading zeros) + 1 + (n+k value bits),
+     * after shifting out the 01 prefix, findMSB tells us n */
+    uint suffix = bits << 2;
+    if (suffix == 0u)
+        return invalid_code;
+
+    int n = 31 - findMSB(suffix);
+
+    /* n >= 16 already implies a value above 65536, which no syntax element
+     * allows. Rejecting it here also keeps the shifts below from wrapping
+     * into negative values and keeps the get_bits() count well under 32. */
+    if (n >= 16)
+        return invalid_code;
+
+    skip_bits(gb, 3 + n);
+    /* (2<<k) + ((1<<n)-1) * (1<<k) is equal to ((1<<n) + 1) << k */
+    return (((1 << n) + 1) << k) + int(get_bits(gb, n + k));
+}
+
+/* ff_zigzag_direct, packed: each byte is the raster index (y*8 + x).
+ * Indexed by scan position; decode_block() splits the entry into
+ * x = value & 7 and y = value >> 3. */
+const uint8_t zigzag[64] = {
+    uint8_t( 0), uint8_t( 1), uint8_t( 8), uint8_t(16),
+    uint8_t( 9), uint8_t( 2), uint8_t( 3), uint8_t(10),
+    uint8_t(17), uint8_t(24), uint8_t(32), uint8_t(25),
+    uint8_t(18), uint8_t(11), uint8_t( 4), uint8_t( 5),
+    uint8_t(12), uint8_t(19), uint8_t(26), uint8_t(33),
+    uint8_t(40), uint8_t(48), uint8_t(41), uint8_t(34),
+    uint8_t(27), uint8_t(20), uint8_t(13), uint8_t( 6),
+    uint8_t( 7), uint8_t(14), uint8_t(21), uint8_t(28),
+    uint8_t(35), uint8_t(42), uint8_t(49), uint8_t(56),
+    uint8_t(57), uint8_t(50), uint8_t(43), uint8_t(36),
+    uint8_t(29), uint8_t(22), uint8_t(15), uint8_t(23),
+    uint8_t(30), uint8_t(37), uint8_t(44), uint8_t(51),
+    uint8_t(58), uint8_t(59), uint8_t(52), uint8_t(45),
+    uint8_t(38), uint8_t(31), uint8_t(39), uint8_t(46),
+    uint8_t(53), uint8_t(60), uint8_t(61), uint8_t(54),
+    uint8_t(47), uint8_t(55), uint8_t(62), uint8_t(63),
+};
+
+/* Entropy coder prediction state. It is reset by main() once per invocation
+ * (one tile of one component) and carried from block to block by
+ * decode_block(). */
+int prev_dc;           /* DC coefficient of the previous block */
+int prev_k_dc;         /* VLC parameter used for the next DC delta */
+int prev_1st_ac_level; /* magnitude of the previous block's first AC level */
+
+/*
+ * Entropy-decode one 8x8 transform block from gb and store its non-zero
+ * coefficients, as 16-bit two's complement values, into plane `comp`.
+ *
+ * pos  - top-left sample of the block within the plane
+ * comp - index into dst[]
+ *
+ * Updates prev_dc, prev_k_dc and prev_1st_ac_level for the next block.
+ * On an invalid symbol the block is abandoned and the coefficients not yet
+ * written keep their pre-cleared value of zero.
+ */
+void decode_block(ivec2 pos, uint comp)
+{
+    int dc_coeff;
+    int abs_diff = apv_read_vlc(prev_k_dc);
+
+    /* A negative result can only come from integer wrap on a corrupt code.
+     * It must not reach prev_k_dc, which is later used as a shift count. */
+    if (abs_diff < 0)
+        return;
+
+    if (abs_diff != 0) {
+        if (get_bit(gb))
+            dc_coeff = prev_dc - abs_diff;
+        else
+            dc_coeff = prev_dc + abs_diff;
+    } else {
+        dc_coeff = prev_dc;
+    }
+
+    if (dc_coeff < APV_MIN_TRANS_COEFF ||
+        dc_coeff > APV_MAX_TRANS_COEFF)
+        return;
+
+    imageStore(dst[comp], pos, uvec4(uint(dc_coeff) & 0xFFFFu));
+    prev_dc   = dc_coeff;
+    prev_k_dc = min(abs_diff >> 1, 5);
+
+    /* ACs */
+    int scan_pos   = 1;
+    int first_ac   = 1;
+    int prev_level = prev_1st_ac_level;
+    int prev_run   = 0;
+
+    do {
+        int coeff_zero_run;
+
+        int k_param = clamp(prev_run >> 2, 0, 2);
+        coeff_zero_run = apv_read_vlc(k_param);
+
+        /* The run must be non-negative and stay inside the block. Without
+         * the lower bound a wrapped value would move scan_pos backwards and
+         * index zigzag[] out of range. */
+        if (coeff_zero_run < 0 ||
+            coeff_zero_run > APV_BLK_COEFFS - scan_pos)
+            return;
+
+        /* image was already pre-cleared to all zeroes */
+        scan_pos += coeff_zero_run;
+        prev_run = coeff_zero_run;
+
+        if (scan_pos < APV_BLK_COEFFS) {
+            int abs_ac_coeff_minus1;
+            int level;
+
+            k_param = clamp(prev_level >> 2, 0, 4);
+            abs_ac_coeff_minus1 = apv_read_vlc(k_param);
+            bool sign_ac_coeff = get_bit(gb);
+
+            /* Reject wrapped magnitudes before they are turned into a level */
+            if (abs_ac_coeff_minus1 < 0)
+                return;
+
+            if (sign_ac_coeff)
+                level = -abs_ac_coeff_minus1 - 1;
+            else
+                level = abs_ac_coeff_minus1 + 1;
+
+            if (level < APV_MIN_TRANS_COEFF || level > APV_MAX_TRANS_COEFF)
+                return;
+
+            /* zigzag[] holds y*8 + x of the coefficient in raster order */
+            int zz = int(zigzag[scan_pos]);
+            imageStore(dst[comp], pos + ivec2(zz & 7, zz >> 3), uvec4(uint(level) & 0xFFFFu));
+
+            prev_level = abs_ac_coeff_minus1 + 1;
+            if (first_ac != 0) {
+                /* Only the first AC level of a block seeds the next block */
+                prev_1st_ac_level = prev_level;
+                first_ac = 0;
+            }
+
+            scan_pos++;
+        }
+    } while (scan_pos < APV_BLK_COEFFS);
+}
+
+/*
+ * Entry point: one workgroup decodes every transform block of one tile
+ * (gl_WorkGroupID.xy) of one component (gl_WorkGroupID.z), in macroblock
+ * raster order.
+ */
+void main(void)
+{
+    const ivec2 tile_xy = ivec2(gl_WorkGroupID.xy);
+    const uint  plane   = uint(gl_WorkGroupID.z);
+
+    /* Reset the entropy coder state for this tile */
+    prev_dc = 0;
+    prev_k_dc = 5;
+    prev_1st_ac_level = 0;
+
+    /* Locate the bitstream of this tile/component and open a reader on it */
+    const int tiles_per_plane = tile_count.x * tile_count.y;
+    const int tile_linear     = tile_xy.y * tile_count.x + tile_xy.x;
+    const uvec2 span = tile_offset[int(plane) * tiles_per_plane + tile_linear];
+    init_get_bits(gb, u8buf(tile_data + span.x), int(span.y));
+
+    /* Plane 0 is never subsampled */
+    ivec2 sub_shift = ivec2(0);
+    if (plane != 0)
+        sub_shift = log2_chroma_sub;
+
+    /* Tile extent in luma samples, then in macroblocks */
+    const ivec2 origin = ivec2(tile_col[tile_xy.x], tile_row[tile_xy.y]);
+    const ivec2 limit  = ivec2(tile_col[tile_xy.x + 1],
+                               tile_row[tile_xy.y + 1]);
+    const ivec2 mb_count  = (limit - origin) / APV_MB_SIZE;
+    /* Transform blocks per macroblock for this plane */
+    const ivec2 blk_count = ivec2(2, 2) >> sub_shift;
+
+    for (int mb_y = 0; mb_y < mb_count.y; mb_y++) {
+        for (int mb_x = 0; mb_x < mb_count.x; mb_x++) {
+            /* Macroblock origin in luma samples */
+            const ivec2 mb_org = APV_MB_SIZE*ivec2(mb_x, mb_y) + origin;
+
+            for (int blk_y = 0; blk_y < blk_count.y; blk_y++) {
+                for (int blk_x = 0; blk_x < blk_count.x; blk_x++) {
+                    const ivec2 luma_pos = mb_org +
+                                           APV_TR_SIZE*ivec2(blk_x, blk_y);
+
+                    decode_block(luma_pos >> sub_shift, plane);
+                }
+            }
+        }
+    }
+}
--- a/libavcodec/vulkan/apv_idct.comp.glsl
+++ b/libavcodec/vulkan/apv_idct.comp.glsl
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#version 460
+#pragma shader_stage(compute)
+#extension GL_GOOGLE_include_directive : require
+
+#include "common.glsl"
+#include "dct.glsl"
+
+/* Upper limits used to size the fixed arrays of the frame data buffer below */
+#define APV_MAX_NUM_COMP    4
+#define APV_MAX_TILE_COLS   20
+#define APV_MAX_TILE_ROWS   20
+#define APV_MAX_TILE_COUNT  (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS)
+/* Transform blocks are 8x8; each workgroup processes 8 of them */
+#define APV_TR_SIZE         8
+#define APV_BLOCKS_PER_WG   8
+
+/* One storage image per plane; coefficients are read from it and the
+ * reconstructed samples are written back in place */
+layout (set = 0, binding = 0) uniform uimage2D dst[];
+/* Per-frame side data written by the host. Scalar layout, so members are
+ * tightly packed in declaration order. */
+layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
+    /* Tile bitstream offsets; not read by this shader */
+    uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
+    /* Per-component quantization matrix, indexed here as [comp][col][y] */
+    uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
+    /* QP of every tile, indexed as comp * APV_MAX_TILE_COUNT + tile index */
+    uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
+    /* Tile column/row boundaries in luma samples */
+    uint16_t tile_col[APV_MAX_TILE_COLS + 1];
+    uint16_t tile_row[APV_MAX_TILE_ROWS + 1];
+};
+
+/* Must stay in sync with DecodePushData on the host side */
+layout (push_constant, scalar) uniform pushConstants {
+    u8buf tile_data;       /* not read by this shader */
+    ivec2 tile_count;      /* number of tile columns (x) and rows (y) */
+    ivec2 log2_chroma_sub; /* chroma subsampling shifts, applied to planes > 0 */
+    int components;        /* not read by this shader */
+    int bit_depth;         /* sample bit depth of the output */
+};
+
+/* Dequantization scale, selected by qp % 6 */
+const int apv_level_scale[6] = { 40, 45, 51, 57, 64, 71 };
+
+/*
+ * Entry point: dequantize and inverse-transform the coefficients stored in
+ * dst[] in place. Each invocation owns one column of one 8x8 block; a
+ * workgroup covers APV_BLOCKS_PER_WG horizontally adjacent blocks of the
+ * component selected by gl_WorkGroupID.z.
+ *
+ * blocks[], idct_scale[] and idct8() are not declared in this file;
+ * presumably they come from dct.glsl - TODO confirm.
+ */
+void main(void)
+{
+    const uvec3 wgid = gl_WorkGroupID;
+    const uint comp = wgid.z;
+
+    const uvec3 lid = gl_LocalInvocationID;
+    const uint  block = (lid.y << 2) | (lid.x >> 3); /* 0..7 block in chunk */
+    const uint  col = lid.x & 0x7u;                  /* 0..7 column in block */
+
+    /* one workgroup handles eight horizontally neighbouring blocks */
+    const int blk_x = int(wgid.x) * APV_BLOCKS_PER_WG + int(block);
+    const int blk_y = int(wgid.y);
+    const ivec2 pos = ivec2(blk_x, blk_y) * APV_TR_SIZE;
+
+    /* note: some oddness happens on tile-boundaries */
+    /* Convert the plane position back to luma samples, since the tile
+     * boundaries in tile_col/tile_row are compared against luma_pos */
+    const ivec2 sub_shift = (comp == 0u) ? ivec2(0) : log2_chroma_sub;
+    const ivec2 luma_pos  = pos << sub_shift;
+
+    /* figure out the tile position */
+    /* Linear search for the last tile whose start is <= luma_pos; positions
+     * past the final boundary stay in the last tile */
+    int tx = 0;
+    while (tx + 1 < tile_count.x && int(tile_col[tx + 1]) <= luma_pos.x)
+        tx++;
+    int ty = 0;
+    while (ty + 1 < tile_count.y && int(tile_row[ty + 1]) <= luma_pos.y)
+        ty++;
+
+    /* The tile QP selects both the level scale (qp % 6) and a power-of-two
+     * gain (qp / 6) */
+    const int tile_idx = ty * tile_count.x + tx;
+    const int qp = int(tile_qp[int(comp) * APV_MAX_TILE_COUNT + tile_idx]);
+    const int level_scale = apv_level_scale[qp % 6];
+    const int qp_shift = qp / 6;
+
+    const int half_range = 1 << (bit_depth - 1);
+    const int max_val = (1 << bit_depth) - 1;
+    const float fact = float(half_range);
+    const float norm = 1.0f / (1024.0f * fact); /* DCT normalization const */
+
+    /* Load this invocation's column, dequantize it and place it in the
+     * shared block with a row stride of 9 floats (one more than the block
+     * width; presumably padding - TODO confirm) */
+    [[unroll]]
+    for (uint y = 0u; y < 8u; y++) {
+        /* load */
+        int   raw   = int(imageLoad(dst[comp], pos + ivec2(col, y)).x);
+        int   coeff = sign_extend(raw, 16);
+        /* dequant + norm */
+        int   qs    = level_scale * int(q_matrix[comp][col][y]) * (1 << qp_shift);
+        float v     = float(coeff * qs) * norm;
+        /* scale */
+        blocks[block][y * 9u + col] = v * idct_scale[y * 8u + col];
+    }
+    barrier();
+
+    /* First pass: elements col, col + 9, ... i.e. down a column */
+    idct8(block, col, 9);
+    barrier();
+
+    /* Add 1.0 to the first element of row `col` before the row pass. After
+     * the final multiplication by `fact` this looks like the mid-range
+     * offset - NOTE(review): confirm against idct8()'s scaling. */
+    blocks[block][col * 9u] += 1.0f;
+
+    /* Second pass: elements col*9, col*9 + 1, ... i.e. along a row */
+    idct8(block, col * 9u, 1);
+    barrier();
+
+    /* Scale back to sample range, round, clamp to the bit depth and store */
+    [[unroll]]
+    for (int y = 0; y < 8; y++) {
+        float v = round(blocks[block][y * 9u + col] * fact);
+        imageStore(dst[comp], pos + ivec2(col, y),
+                   uvec4(uint(clamp(int(v), 0, max_val))));
+    }
+}
--- a/libavcodec/vulkan_apv.c
+++ b/libavcodec/vulkan_apv.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2025 Lynne <dev@lynne.ee>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "vulkan_decode.h"
+#include "hwaccel_internal.h"
+
+#include "apv_decode.h"
+#include "libavutil/mem.h"
+
+extern const unsigned char ff_apv_decode_comp_spv_data[];
+extern const unsigned int ff_apv_decode_comp_spv_len;
+
+extern const unsigned char ff_apv_idct_comp_spv_data[];
+extern const unsigned int ff_apv_idct_comp_spv_len;
+
+/* Decoder descriptor for APV: decoding is done with compute shaders, so only
+ * a compute-capable queue is requested. */
+const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc = {
+    .codec_id         = AV_CODEC_ID_APV,
+    .queue_flags      = VK_QUEUE_COMPUTE_BIT,
+};
+
+/* Per-frame hwaccel state (allocated by the generic code with
+ * frame_priv_data_size bytes). */
+typedef struct APVVulkanDecodePicture {
+    FFVulkanDecodePicture vp;
+
+    /* Pooled buffer holding tile offsets, qmatrices, QPs and tile bounds */
+    AVBufferRef *frame_data_buf;
+    /* Offsets table handed to ff_vk_decode_add_slice(); only used when the
+     * packet is not host-mapped */
+    uint32_t    *frame_data;
+    /* Number of slices (tile payloads) recorded so far for this frame */
+    int          tile_num;
+} APVVulkanDecodePicture;
+
+/* Codec-specific state stored in FFVulkanDecodeShared.sd_ctx. */
+typedef struct APVVulkanDecodeContext {
+    FFVulkanShader decode; /* entropy decoding shader */
+    FFVulkanShader idct;   /* dequantization + inverse transform shader */
+
+    AVBufferPool *frame_data_pool; /* pool backing the per-frame data buffers */
+} APVVulkanDecodeContext;
+
+/* Push constants shared by both shaders; the member order and sizes must
+ * match the `pushConstants` block declared in the GLSL sources. */
+typedef struct DecodePushData {
+    VkDeviceAddress tile_data; /* device address of the tile bitstream buffer */
+    int tile_count[2];         /* tile columns, tile rows */
+    int log2_chroma_sub[2];    /* horizontal, vertical chroma shift */
+    int components;            /* number of components in the sw format */
+    int bit_depth;
+} DecodePushData;
+
+/*
+ * Per-frame setup.
+ *
+ * Tries to host-map the packet so the shaders can read the tiles in place,
+ * then acquires the per-frame data buffer and fills everything in it except
+ * the tile offsets (written per slice): quantization matrices, per-tile QPs
+ * and the tile column/row boundaries. Finally prepares the output frame.
+ *
+ * Returns 0 on success, a negative AVERROR code on failure.
+ */
+static int vk_apv_start_frame(AVCodecContext          *avctx,
+                              const AVBufferRef       *buffer_ref,
+                              av_unused const uint8_t *buffer,
+                              av_unused uint32_t       size)
+{
+    APVDecodeContext *apv = avctx->priv_data;
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx = dec->shared_ctx;
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
+    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+    int err;
+
+    /* Byte sizes of the consecutive regions of the frame data buffer, in the
+     * order the shaders declare them */
+    const int offsets_len = 2*4*APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP;
+    const int qmatrix_len = 64*APV_MAX_NUM_COMP;
+    const int tile_qp_len = APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP;
+    const int cols_len    = (APV_MAX_TILE_COLS + 1)*2;
+    const int rows_len    = (APV_MAX_TILE_ROWS + 1)*2;
+    const int fd_size     = offsets_len + qmatrix_len + tile_qp_len +
+                            cols_len + rows_len;
+
+    /* Host map the input tile data if supported; the result is deliberately
+     * not checked, slices are copied instead when no mapping exists */
+    if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
+        ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
+                              buffer_ref,
+                              VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                              VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT);
+
+    /* Allocate frame data buffer */
+    err = ff_vk_get_pooled_buffer(&ctx->s, &apvvk->frame_data_pool,
+                                  &apvvp->frame_data_buf,
+                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                                  NULL, fd_size,
+                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+    if (err < 0)
+        return err;
+
+    /* Region pointers; the tile offsets occupy the start of the mapping */
+    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
+    uint8_t *qmatrix_dst = frame_data->mapped_mem + offsets_len;
+    uint8_t *tile_qp_dst = qmatrix_dst + qmatrix_len;
+    uint8_t *cols_dst    = tile_qp_dst + tile_qp_len;
+    uint8_t *rows_dst    = cols_dst + cols_len;
+
+    /* per-component qmatrix and QPs */
+    for (int c = 0; c < APV_MAX_NUM_COMP; c++) {
+        uint8_t *qp_row = tile_qp_dst + c*APV_MAX_TILE_COUNT;
+
+        memcpy(qmatrix_dst + 64*c,
+               apv->cur_raw_frame->frame_header.quantization_matrix.q_matrix[c],
+               64);
+
+        for (int t = 0; t < APV_MAX_TILE_COUNT; t++)
+            qp_row[t] = apv->cur_raw_frame->tile[t].tile_header.tile_qp[c];
+    }
+
+    /* tile col/row offset */
+    memcpy(cols_dst, apv->tile_info.col_starts, cols_len);
+    memcpy(rows_dst, apv->tile_info.row_starts, rows_len);
+
+    /* Prepare frame to be used */
+    err = ff_vk_decode_prepare_frame_sdr(dec, apv->output_frame, vp, 1,
+                                         FF_VK_REP_NATIVE, 0);
+
+    return err < 0 ? err : 0;
+}
+
+/*
+ * Record one slice (tile payload) for the current frame.
+ *
+ * When the packet was host-mapped, only the payload's offset inside the
+ * mapping is recorded; otherwise the payload is appended to the slices
+ * buffer via ff_vk_decode_add_slice(). In both cases an (offset, size) pair
+ * of 32-bit values is written to the next slot at the start of the frame
+ * data buffer.
+ *
+ * Returns 0 on success, AVERROR_INVALIDDATA when more slices arrive than the
+ * offset table can hold, or the error from ff_vk_decode_add_slice().
+ */
+static int vk_apv_decode_slice(AVCodecContext *avctx,
+                               const uint8_t  *data,
+                               uint32_t        size)
+{
+    APVDecodeContext *apv = avctx->priv_data;
+
+    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+
+    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
+    FFVkBuffer *slices_buf = vp->slices_buf ? (FFVkBuffer *)vp->slices_buf->data : NULL;
+
+    uint32_t offset;
+    int slot;
+
+    /* The offset table has exactly this many (offset, size) slots; writing
+     * past it would run into the qmatrix data and beyond the buffer */
+    if (apvvp->tile_num >= APV_MAX_NUM_COMP*APV_MAX_TILE_COUNT)
+        return AVERROR_INVALIDDATA;
+
+    if (slices_buf && slices_buf->host_ref) {
+        /* Payload is read in place: its offset is relative to the mapping */
+        slot   = apvvp->tile_num++;
+        offset = data - slices_buf->mapped_mem;
+    } else {
+        int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0,
+                                         &apvvp->tile_num,
+                                         (const uint32_t **)&apvvp->frame_data);
+        if (err < 0)
+            return err;
+
+        /* add_slice advanced tile_num and recorded the copy's offset */
+        slot   = apvvp->tile_num - 1;
+        offset = apvvp->frame_data[slot];
+    }
+
+    AV_WN32(frame_data->mapped_mem + (2*slot + 0)*sizeof(uint32_t), offset);
+    AV_WN32(frame_data->mapped_mem + (2*slot + 1)*sizeof(uint32_t), size);
+
+    return 0;
+}
+
+/*
+ * Record and submit the GPU work for the current frame:
+ *   1. clear the output images to zero,
+ *   2. run the entropy decoding shader (one workgroup per tile/component),
+ *   3. run the dequantization + inverse transform shader in place.
+ *
+ * Returns 0 on success or a negative AVERROR code. Failures while recording
+ * are propagated to the caller and the dependencies already attached to the
+ * execution context are discarded.
+ */
+static int vk_apv_end_frame(AVCodecContext *avctx)
+{
+    int err;
+    APVDecodeContext *apv = avctx->priv_data;
+    const CodedBitstreamAPVContext *apv_cbc = apv->cbc->priv_data;
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx = dec->shared_ctx;
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
+    FFVulkanFunctions *vk = &ctx->s.vkfn;
+
+    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
+    FFVulkanDecodePicture *vp = &apvvp->vp;
+
+    FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data;
+    FFVkBuffer *frame_data_buf = (FFVkBuffer *)apvvp->frame_data_buf->data;
+
+    AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+    enum AVPixelFormat sw_format = hwfc->sw_format;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(sw_format);
+
+    VkImageMemoryBarrier2 img_bar[8];
+    int nb_img_bar = 0;
+
+    FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
+    RET(ff_vk_exec_start(&ctx->s, exec));
+
+    /* Make sure the buffer is flushed */
+    RET(ff_vk_flush_buffer(&ctx->s, frame_data_buf, 0, frame_data_buf->size, 1));
+
+    /* Prepare deps */
+    RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, apv->output_frame,
+                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
+
+    RET(ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value,
+                                    apv->output_frame));
+
+    /* The execution context takes over both buffer references; the raw
+     * FFVkBuffer pointers above stay usable while it holds them */
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0));
+    vp->slices_buf = NULL;
+    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &apvvp->frame_data_buf, 1, 0));
+    apvvp->frame_data_buf = NULL;
+
+    /* The previous image contents are not needed */
+    AVVkFrame *vkf = (AVVkFrame *)apv->output_frame->data[0];
+    vkf->layout[0] = VK_IMAGE_LAYOUT_UNDEFINED;
+    vkf->access[0] = VK_ACCESS_2_NONE;
+
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame,
+                        img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
+                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
+                        VK_ACCESS_2_TRANSFER_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* Zero frame: the decode shader only writes non-zero coefficients */
+    for (int i = 0; i < ff_vk_count_images(vkf); i++)
+        vk->CmdClearColorImage(exec->buf, vkf->img[i],
+                               VK_IMAGE_LAYOUT_GENERAL,
+                               &((VkClearColorValue) { 0 }),
+                               1, &((VkImageSubresourceRange) {
+                                   .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                                   .levelCount = 1,
+                                   .layerCount = 1,
+                               }));
+
+    /* Wait for the frame to get zeroed out before continuing */
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* Setup push data, shared by both shaders */
+    DecodePushData pd = (DecodePushData) {
+        .tile_data = slices_buf->address,
+        .tile_count = { apv->tile_info.tile_cols, apv->tile_info.tile_rows },
+        .log2_chroma_sub = { desc->log2_chroma_w, desc->log2_chroma_h },
+        .components = desc->nb_components,
+        .bit_depth = apv_cbc->bit_depth,
+    };
+
+    /* Decoding */
+    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->decode,
+                                  apv->output_frame, vp->view.out,
+                                  0, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->decode,
+                                    0, 1, 0,
+                                    frame_data_buf,
+                                    0, frame_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->decode);
+    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->decode,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    /* One workgroup per tile and component */
+    vk->CmdDispatch(exec->buf,
+                    apv->tile_info.tile_cols, apv->tile_info.tile_rows,
+                    desc->nb_components);
+
+    /* Wait for all decoding to finish */
+    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
+                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
+                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
+                        VK_IMAGE_LAYOUT_GENERAL,
+                        VK_QUEUE_FAMILY_IGNORED);
+    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
+        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
+        .pImageMemoryBarriers = img_bar,
+        .imageMemoryBarrierCount = nb_img_bar,
+    });
+    nb_img_bar = 0;
+
+    /* iDCT */
+    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->idct,
+                                  apv->output_frame, vp->view.out,
+                                  0, 0,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  VK_NULL_HANDLE);
+    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->idct,
+                                    0, 1, 0,
+                                    frame_data_buf,
+                                    0, frame_data_buf->size,
+                                    VK_FORMAT_UNDEFINED);
+
+    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->idct);
+    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->idct,
+                                   VK_SHADER_STAGE_COMPUTE_BIT,
+                                   0, sizeof(pd), &pd);
+
+    /* one workgroup per group of 8 horizontally adjacent transform blocks,
+     * in the luma basis coords, in case a block is OOB writes/reads are ignored */
+    int idct_cx = 0, idct_by = 0;
+    for (int comp = 0; comp < desc->nb_components; comp++) {
+        int sw = (comp == 0) ? 0 : desc->log2_chroma_w;
+        int sh = (comp == 0) ? 0 : desc->log2_chroma_h;
+        /* 8x8 blocks per plane, rounded up */
+        int bx = (avctx->coded_width  + (1 << (3 + sw)) - 1) >> (3 + sw);
+        int by = (avctx->coded_height + (1 << (3 + sh)) - 1) >> (3 + sh);
+        idct_cx = FFMAX(idct_cx, (bx + 7) >> 3);
+        idct_by = FFMAX(idct_by, by);
+    }
+    vk->CmdDispatch(exec->buf, idct_cx, idct_by, desc->nb_components);
+
+    err = ff_vk_exec_submit(&ctx->s, exec);
+    if (err < 0)
+        return err;
+
+    return 0;
+
+fail:
+    /* Drop whatever was attached to the execution context and report the
+     * error instead of pretending the frame was decoded */
+    ff_vk_exec_discard_deps(&ctx->s, exec);
+    return err;
+}
+
+/*
+ * Build the entropy decoding shader: a 1x1x1 workgroup, DecodePushData as
+ * push constants, and one descriptor set holding the per-plane output images
+ * plus the frame data buffer.
+ *
+ * Returns the result of linking/registering the shader (negative AVERROR on
+ * failure).
+ */
+static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s,
+                              FFVkExecPool *pool, FFVulkanShader *shd)
+{
+    AVHWFramesContext *frames = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+    const int nb_planes = av_pix_fmt_count_planes(frames->sw_format);
+    int err;
+
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
+                      (uint32_t []) { 1, 1, 1 }, 0);
+    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    /* Binding order must match the shader: images first, then the buffer */
+    const FFVulkanDescriptorSetBinding bindings[] = {
+        {
+            .name   = "dst",
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .elems  = nb_planes,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+        {
+            .name   = "frame_data_buf",
+            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(s, shd, bindings, 2, 0, 0);
+
+    err = ff_vk_shader_link(s, shd,
+                            ff_apv_decode_comp_spv_data,
+                            ff_apv_decode_comp_spv_len, "main");
+    if (err < 0)
+        return err;
+
+    return ff_vk_shader_register_exec(s, pool, shd);
+}
+
+/*
+ * Build the dequantization + inverse transform shader: a 32x2x1 workgroup
+ * (64 invocations, i.e. 8 blocks of 8 columns), DecodePushData as push
+ * constants and the same descriptor set layout as the decode shader. The
+ * 8x8 table of iDCT scale factors is passed as specialization constants.
+ *
+ * Returns the result of linking/registering the shader (negative AVERROR on
+ * failure).
+ */
+static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
+                            FFVkExecPool *pool, FFVulkanShader *shd)
+{
+    int err;
+    AVHWFramesContext *dec_frames_ctx;
+    dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
+
+    /* Room for 1 + 64 specialization constants of 32 bits each */
+    SPEC_LIST_CREATE(sl, 1 + 64, (1 + 64)*sizeof(uint32_t))
+    SPEC_LIST_ADD(sl, 16, 32, 8); /* nb_blocks per workgroup */
+
+    /* Per-frequency scale factors of the 1-D transform; entry 0 uses the
+     * same value as entry 4 */
+    const double idct_8_scales[8] = {
+        cos(4.0*M_PI/16.0) / 2.0, cos(1.0*M_PI/16.0) / 2.0,
+        cos(2.0*M_PI/16.0) / 2.0, cos(3.0*M_PI/16.0) / 2.0,
+        cos(4.0*M_PI/16.0) / 2.0, cos(5.0*M_PI/16.0) / 2.0,
+        cos(6.0*M_PI/16.0) / 2.0, cos(7.0*M_PI/16.0) / 2.0,
+    };
+    /* Constant ids 18..81 carry the separable 2-D table (row * column
+     * factor) as raw float bits */
+    for (int i = 0; i < 64; i++)
+        SPEC_LIST_ADD(sl, 18 + i, 32,
+                      av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7]));
+
+    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
+                      (uint32_t []) { 32, 2, 1 }, 0);
+    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
+                                VK_SHADER_STAGE_COMPUTE_BIT);
+
+    /* Binding order must match the shader: images first, then the buffer */
+    FFVulkanDescriptorSetBinding desc_set[] = {
+        {
+            .name       = "dst",
+            .type       = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+            .stages     = VK_SHADER_STAGE_COMPUTE_BIT,
+            .elems      = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
+        },
+        {
+            .name        = "frame_data_buf",
+            .type        = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+            .stages      = VK_SHADER_STAGE_COMPUTE_BIT,
+        },
+    };
+    ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
+
+    RET(ff_vk_shader_link(s, shd,
+                          ff_apv_idct_comp_spv_data,
+                          ff_apv_idct_comp_spv_len, "main"));
+
+    RET(ff_vk_shader_register_exec(s, pool, shd));
+
+fail:
+    return err;
+}
+
+/*
+ * Release the APV-specific state attached to the shared decode context:
+ * both shaders, the frame data pool and the context structure itself.
+ * Installed as ctx->sd_ctx_free by vk_decode_apv_init().
+ */
+static void vk_decode_apv_uninit(FFVulkanDecodeShared *ctx)
+{
+    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
+
+    ff_vk_shader_free(&ctx->s, &apvvk->decode);
+    ff_vk_shader_free(&ctx->s, &apvvk->idct);
+
+    av_buffer_pool_uninit(&apvvk->frame_data_pool);
+
+    /* Free through the owning field so it is reset to NULL rather than left
+     * pointing at freed memory */
+    av_freep(&ctx->sd_ctx);
+}
+
+/*
+ * hwaccel init callback: run the common Vulkan decode initialization, then
+ * allocate the APV-specific context and build both compute shaders.
+ *
+ * Returns a negative AVERROR code on failure.
+ */
+static int vk_decode_apv_init(AVCodecContext *avctx)
+{
+    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
+    FFVulkanDecodeShared *ctx;
+    APVVulkanDecodeContext *apvvk;
+    int err;
+
+    err = ff_vk_decode_init(avctx);
+    if (err < 0)
+        return err;
+
+    /* The shared context only exists once the common init has run */
+    ctx = dec->shared_ctx;
+
+    apvvk = av_mallocz(sizeof(*apvvk));
+    ctx->sd_ctx = apvvk;
+    if (!apvvk)
+        return AVERROR(ENOMEM);
+
+    /* From here on the shared context owns and frees the APV state */
+    ctx->sd_ctx_free = &vk_decode_apv_uninit;
+
+    err = init_decode_shader(avctx, &ctx->s, &ctx->exec_pool, &apvvk->decode);
+    if (err < 0)
+        return err;
+
+    err = init_idct_shader(avctx, &ctx->s, &ctx->exec_pool, &apvvk->idct);
+
+    return err;
+}
+
+/*
+ * Destructor for the per-frame private data: releases the common Vulkan
+ * picture state first, then the frame data buffer if it is still held.
+ */
+static void vk_apv_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
+{
+    APVVulkanDecodePicture *pic = data;
+    AVHWDeviceContext *device = _hwctx.nc;
+
+    ff_vk_decode_free_frame(device, &pic->vp);
+    av_buffer_unref(&pic->frame_data_buf);
+}
+
+/* Hwaccel entry for APV on Vulkan. The frame callbacks are APV-specific;
+ * thread-context update, uninit and frame parameters reuse the common
+ * Vulkan decode helpers. */
+const FFHWAccel ff_apv_vulkan_hwaccel = {
+    .p.name                = "apv_vulkan",
+    .p.type                = AVMEDIA_TYPE_VIDEO,
+    .p.id                  = AV_CODEC_ID_APV,
+    .p.pix_fmt             = AV_PIX_FMT_VULKAN,
+    .start_frame           = &vk_apv_start_frame,
+    .decode_slice          = &vk_apv_decode_slice,
+    .end_frame             = &vk_apv_end_frame,
+    .free_frame_priv       = &vk_apv_free_frame_priv,
+    /* Size of the per-frame private data passed to the frame callbacks */
+    .frame_priv_data_size  = sizeof(APVVulkanDecodePicture),
+    .init                  = &vk_decode_apv_init,
+    .update_thread_context = &ff_vk_update_thread_context,
+    .uninit                = &ff_vk_decode_uninit,
+    .frame_params          = &ff_vk_frame_params,
+    /* Size of avctx->internal->hwaccel_priv_data */
+    .priv_data_size        = sizeof(FFVulkanDecodeContext),
+    .caps_internal         = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE,
+};
--- a/libavcodec/vulkan_decode.c
+++ b/libavcodec/vulkan_decode.c
@@ -28,6 +28,7 @@
 #define DECODER_IS_SDR(codec_id) \
    (((codec_id) == AV_CODEC_ID_FFV1) || \
     ((codec_id) == AV_CODEC_ID_DPX) || \
+     ((codec_id) == AV_CODEC_ID_APV) || \
     ((codec_id) == AV_CODEC_ID_PRORES_RAW) || \
     ((codec_id) == AV_CODEC_ID_PRORES))

@@ -55,6 +56,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc;
 #if CONFIG_DPX_VULKAN_HWACCEL
 extern const FFVulkanDecodeDescriptor ff_vk_dec_dpx_desc;
 #endif
+#if CONFIG_APV_VULKAN_HWACCEL
+extern const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc;
+#endif

 static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_H264_VULKAN_HWACCEL
@@ -81,6 +85,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = {
 #if CONFIG_DPX_VULKAN_HWACCEL
    &ff_vk_dec_dpx_desc,
 #endif
+#if CONFIG_APV_VULKAN_HWACCEL
+    &ff_vk_dec_apv_desc,
+#endif
 };

 typedef struct FFVulkanDecodeProfileData {