apv_decode: add a Vulkan hwaccel

This commit is contained in:
Lynne
2025-11-02 14:38:33 +00:00
parent 704df177aa
commit 5ad8c67e6c
11 changed files with 845 additions and 1 deletions

View File

@@ -11,6 +11,7 @@ version <next>:
- Add AMF Frame Rate Converter (vf_frc_amf) filter
- SMPTE 2094-50 metadata support and passthrough
- ProRes RAW VideoToolbox hwaccel
- APV Vulkan hwaccel
version 8.1:

2
configure vendored
View File

@@ -3403,6 +3403,8 @@ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
vulkan_deps="threads"
vulkan_deps_any="libdl LoadLibrary"
apv_vulkan_hwaccel_deps="vulkan spirv_compiler"
apv_vulkan_hwaccel_select="apv_decoder"
av1_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_AV1"
av1_d3d11va_hwaccel_select="av1_decoder"
av1_d3d11va2_hwaccel_deps="d3d11va DXVA_PicParams_AV1"

View File

@@ -1047,6 +1047,7 @@ OBJS-$(CONFIG_VIDEOTOOLBOX) += videotoolbox.o
OBJS-$(CONFIG_VDPAU) += vdpau.o
OBJS-$(CONFIG_VULKAN) += vulkan.o vulkan_video.o
OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan_decode.o vulkan_apv.o
OBJS-$(CONFIG_AV1_D3D11VA_HWACCEL) += dxva2_av1.o
OBJS-$(CONFIG_AV1_DXVA2_HWACCEL) += dxva2_av1.o
OBJS-$(CONFIG_AV1_D3D12VA_HWACCEL) += dxva2_av1.o d3d12va_av1.o

View File

@@ -51,6 +51,9 @@ static enum AVPixelFormat get_pixel_format(AVCodecContext *avctx,
enum AVPixelFormat pix_fmt)
{
enum AVPixelFormat pix_fmts[] = {
#if CONFIG_APV_VULKAN_HWACCEL
AV_PIX_FMT_VULKAN,
#endif
pix_fmt,
AV_PIX_FMT_NONE,
};
@@ -603,6 +606,9 @@ const FFCodec ff_apv_decoder = {
AV_CODEC_CAP_FRAME_THREADS,
.caps_internal = FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM,
.hw_configs = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_APV_VULKAN_HWACCEL
HWACCEL_VULKAN(apv),
#endif
NULL
},
};

View File

@@ -19,6 +19,7 @@
#ifndef AVCODEC_HWACCELS_H
#define AVCODEC_HWACCELS_H
extern const struct FFHWAccel ff_apv_vulkan_hwaccel;
extern const struct FFHWAccel ff_av1_d3d11va_hwaccel;
extern const struct FFHWAccel ff_av1_d3d11va2_hwaccel;
extern const struct FFHWAccel ff_av1_d3d12va_hwaccel;

View File

@@ -29,7 +29,7 @@
#include "version_major.h"
#define LIBAVCODEC_VERSION_MINOR 33
#define LIBAVCODEC_VERSION_MINOR 34
#define LIBAVCODEC_VERSION_MICRO 100
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \

View File

@@ -1,6 +1,9 @@
clean::
$(RM) $(CLEANSUFFIXES:%=libavcodec/vulkan/%)
OBJS-$(CONFIG_APV_VULKAN_HWACCEL) += vulkan/apv_decode.comp.spv.o \
vulkan/apv_idct.comp.spv.o
OBJS-$(CONFIG_FFV1_VULKAN_ENCODER) += vulkan/ffv1_enc_setup.comp.spv.o \
vulkan/ffv1_enc_reset.comp.spv.o \
vulkan/ffv1_enc_reset_golomb.comp.spv.o \

View File

@@ -0,0 +1,216 @@
/*
* Copyright (c) 2025 Lynne <dev@lynne.ee>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#version 460
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#include "common.glsl"
/* Array-size limits; they size the tables inside frame_data_buf below. */
#define APV_MAX_NUM_COMP 4
#define APV_MAX_TILE_COLS 20
#define APV_MAX_TILE_ROWS 20
#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS)
/* Inclusive range a decoded coefficient must lie in; decode_block() stops
 * decoding the current block as soon as a value falls outside of it. */
#define APV_MIN_TRANS_COEFF -32768
#define APV_MAX_TRANS_COEFF 32767
/* Transform blocks are APV_TR_SIZE x APV_TR_SIZE coefficients. */
#define APV_TR_SIZE 8
#define APV_BLK_COEFFS (APV_TR_SIZE * APV_TR_SIZE)
/* Step, in luma samples, used by main() to walk the macroblocks of a tile. */
#define APV_MB_SIZE (ivec2(16, 16))
/* Output images, indexed by component; decode_block() stores the low 16 bits
 * of every decoded coefficient here. */
layout (set = 0, binding = 0) uniform writeonly uimage2D dst[];
/* Per-frame parameters. This shader reads tile_offset, tile_col and tile_row;
 * q_matrix and tile_qp are declared so the block layout stays complete but are
 * not referenced here. */
layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
/* .x is added to tile_data to locate the bitstream, .y is the length handed
 * to init_get_bits() */
uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
/* tile boundaries: entry i is the start of tile column/row i, and
 * entry i + 1 minus entry i is used as its size */
uint16_t tile_col[APV_MAX_TILE_COLS + 1];
uint16_t tile_row[APV_MAX_TILE_ROWS + 1];
};
/* Push constants; only tile_data, tile_count and log2_chroma_sub are read by
 * this shader. */
layout (push_constant, scalar) uniform pushConstants {
u8buf tile_data;
ivec2 tile_count;
ivec2 log2_chroma_sub;
int components;
int bit_depth;
};
/* Bit reader for the tile/component handled by this invocation; set up in
 * main() and consumed by apv_read_vlc() and decode_block(). */
GetBitContext gb;
/*
 * Read one variable-length code with parameter k from the global bit reader
 * gb and return its value.
 *
 * Three prefix forms are handled: "1", "00" and "01"; the matching number of
 * bits is consumed from gb. When the 30 bits following a "01" prefix are all
 * zero, nothing is consumed and APV_MAX_TRANS_COEFF + 1 is returned.
 *
 * NOTE(review): in the "01" case n can be as large as 29, so
 * ((1 << n) + 1) << k and get_bits(gb, n + k) may exceed 32 bits - confirm
 * that get_bits() and the callers' range checks cope with such input.
 */
int apv_read_vlc(int k)
{
/* Top 32 bits, longest valid APV code is 1 + 2*5 + 5 = 16 bits */
uint bits = show_bits(gb, 32);
uint mask = (1u << k) - 1u;
/* 1xxx: short, length 1+k, value = next k bits */
if (bits >= 0x80000000u) {
skip_bits(gb, 1 + k);
return int((bits >> (31 - k)) & mask);
}
/* 00xxx: short, length 2+k, value = (1<<k) + next k bits */
if (bits < 0x40000000u) {
skip_bits(gb, 2 + k);
return int((bits >> (30 - k)) & mask) + (1 << k);
}
/* 01 prefix + (n leading zeros) + 1 + (n+k value bits),
 * after shifting out the 01 prefix, findMSB tells us n */
uint suffix = bits << 2;
/* no terminating 1 bit within the window: report an out-of-range value */
if (suffix == 0u)
return APV_MAX_TRANS_COEFF + 1;
int n = 31 - findMSB(suffix);
/* skip the 2 prefix bits, the n zeros and the terminating 1 */
skip_bits(gb, 3 + n);
/* (2<<k) + ((1<<n)-1) * (1<<k) is equal to ((1<<n) + 1) << k */
return (((1 << n) + 1) << k) + int(get_bits(gb, n + k));
}
/* ff_zigzag_direct, packed: each byte is the raster index (y*8 + x). */
const uint8_t zigzag[64] = {
uint8_t( 0), uint8_t( 1), uint8_t( 8), uint8_t(16),
uint8_t( 9), uint8_t( 2), uint8_t( 3), uint8_t(10),
uint8_t(17), uint8_t(24), uint8_t(32), uint8_t(25),
uint8_t(18), uint8_t(11), uint8_t( 4), uint8_t( 5),
uint8_t(12), uint8_t(19), uint8_t(26), uint8_t(33),
uint8_t(40), uint8_t(48), uint8_t(41), uint8_t(34),
uint8_t(27), uint8_t(20), uint8_t(13), uint8_t( 6),
uint8_t( 7), uint8_t(14), uint8_t(21), uint8_t(28),
uint8_t(35), uint8_t(42), uint8_t(49), uint8_t(56),
uint8_t(57), uint8_t(50), uint8_t(43), uint8_t(36),
uint8_t(29), uint8_t(22), uint8_t(15), uint8_t(23),
uint8_t(30), uint8_t(37), uint8_t(44), uint8_t(51),
uint8_t(58), uint8_t(59), uint8_t(52), uint8_t(45),
uint8_t(38), uint8_t(31), uint8_t(39), uint8_t(46),
uint8_t(53), uint8_t(60), uint8_t(61), uint8_t(54),
uint8_t(47), uint8_t(55), uint8_t(62), uint8_t(63),
};
/* Predictor state carried from one block to the next; reset by main() and
 * updated by decode_block(). */
/* DC coefficient of the previously decoded block */
int prev_dc;
/* VLC parameter for the next DC difference */
int prev_k_dc;
/* magnitude of the first AC level of the previous block; seeds the AC level
 * VLC parameter of the next block */
int prev_1st_ac_level;
/*
 * Decode the coefficients of one 8x8 block from gb and store them in
 * dst[comp].
 *
 * pos  - image coordinates of the block's top-left coefficient
 * comp - index into dst[]
 *
 * Only the DC value and non-zero AC values are written, each truncated to its
 * low 16 bits; everything else relies on the image having been cleared
 * beforehand. The function returns early, without signalling an error, when a
 * decoded value is out of range or a zero run would pass the end of the block.
 * Updates prev_dc, prev_k_dc and prev_1st_ac_level for the next block.
 */
void decode_block(ivec2 pos, uint comp)
{
int dc_coeff;
/* DC is coded as a magnitude difference to the previous block's DC */
int abs_diff = apv_read_vlc(prev_k_dc);
if (abs_diff != 0) {
/* a sign bit is only present for non-zero differences */
if (get_bit(gb))
dc_coeff = prev_dc - abs_diff;
else
dc_coeff = prev_dc + abs_diff;
} else {
dc_coeff = prev_dc;
}
if (dc_coeff < APV_MIN_TRANS_COEFF ||
dc_coeff > APV_MAX_TRANS_COEFF)
return;
imageStore(dst[comp], pos, uvec4(uint(dc_coeff) & 0xFFFFu));
prev_dc = dc_coeff;
prev_k_dc = min(abs_diff >> 1, 5);
/* ACs */
int scan_pos = 1;
int first_ac = 1;
int prev_level = prev_1st_ac_level;
int prev_run = 0;
do {
int coeff_zero_run;
/* the run VLC parameter adapts to the previous run length */
int k_param = clamp(prev_run >> 2, 0, 2);
coeff_zero_run = apv_read_vlc(k_param);
/* a run reaching past the last coefficient is invalid */
if (coeff_zero_run > APV_BLK_COEFFS - scan_pos)
return;
/* image was already pre-cleared to all zeroes */
scan_pos += coeff_zero_run;
prev_run = coeff_zero_run;
/* a run ending exactly at the end of the block carries no level */
if (scan_pos < APV_BLK_COEFFS) {
int abs_ac_coeff_minus1;
int level;
/* the level VLC parameter adapts to the previous level magnitude */
k_param = clamp(prev_level >> 2, 0, 4);
abs_ac_coeff_minus1 = apv_read_vlc(k_param);
bool sign_ac_coeff = get_bit(gb);
if (sign_ac_coeff)
level = -abs_ac_coeff_minus1 - 1;
else
level = abs_ac_coeff_minus1 + 1;
if (level < APV_MIN_TRANS_COEFF || level > APV_MAX_TRANS_COEFF)
return;
/* map the scan position to x = zz & 7, y = zz >> 3 inside the block */
int zz = int(zigzag[scan_pos]);
imageStore(dst[comp], pos + ivec2(zz & 7, zz >> 3), uvec4(uint(level) & 0xFFFFu));
prev_level = abs_ac_coeff_minus1 + 1;
/* remember the first AC magnitude of this block for the next one */
if (first_ac != 0) {
prev_1st_ac_level = prev_level;
first_ac = 0;
}
scan_pos++;
}
} while (scan_pos < APV_BLK_COEFFS);
}
/*
 * Entry point: one workgroup decodes every block of one tile for one
 * component. The workgroup x/y coordinates select the tile, z selects the
 * component.
 */
void main(void)
{
    const uint plane = uint(gl_WorkGroupID.z);
    const ivec2 tile = ivec2(gl_WorkGroupID.xy);

    /* each tile/component starts from fresh predictor state */
    prev_dc = 0;
    prev_k_dc = 5;
    prev_1st_ac_level = 0;

    /* find this tile's bitstream and point the bit reader at it */
    const int tiles_per_comp = tile_count.x * tile_count.y;
    const int tile_nb = tile.y * tile_count.x + tile.x;
    const uvec2 span = tile_offset[int(plane) * tiles_per_comp + tile_nb];
    init_get_bits(gb, u8buf(tile_data + span.x), int(span.y));

    /* component 0 is never subsampled */
    ivec2 shift = ivec2(0);
    if (plane != 0)
        shift = log2_chroma_sub;

    /* tile extent, expressed in macroblocks */
    const ivec2 origin = ivec2(tile_col[tile.x], tile_row[tile.y]);
    const ivec2 limit = ivec2(tile_col[tile.x + 1], tile_row[tile.y + 1]);
    const ivec2 mbs = (limit - origin) / APV_MB_SIZE;

    /* number of transform blocks this component has per macroblock */
    const ivec2 blks = ivec2(2, 2) >> shift;

    for (int my = 0; my < mbs.y; my++) {
        for (int mx = 0; mx < mbs.x; mx++) {
            for (int by = 0; by < blks.y; by++) {
                for (int bx = 0; bx < blks.x; bx++) {
                    const ivec2 unscaled = APV_MB_SIZE*ivec2(mx, my) +
                                           APV_TR_SIZE*ivec2(bx, by) + origin;
                    decode_block(unscaled >> shift, plane);
                }
            }
        }
    }
}

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2025 Lynne <dev@lynne.ee>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#version 460
#pragma shader_stage(compute)
#extension GL_GOOGLE_include_directive : require
#include "common.glsl"
#include "dct.glsl"
/* Array-size limits; they size the tables inside frame_data_buf below. */
#define APV_MAX_NUM_COMP 4
#define APV_MAX_TILE_COLS 20
#define APV_MAX_TILE_ROWS 20
#define APV_MAX_TILE_COUNT (APV_MAX_TILE_COLS * APV_MAX_TILE_ROWS)
/* Transform block edge length, in samples. */
#define APV_TR_SIZE 8
/* Number of horizontally adjacent blocks covered by one workgroup. */
#define APV_BLOCKS_PER_WG 8
/* Images indexed by component; coefficients are read from and the
 * reconstructed samples written back to the same image. */
layout (set = 0, binding = 0) uniform uimage2D dst[];
/* Per-frame parameters. This shader reads q_matrix, tile_qp, tile_col and
 * tile_row; tile_offset is declared but not referenced here. */
layout (set = 0, binding = 1, scalar) readonly buffer frame_data_buf {
uvec2 tile_offset[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
uint8_t q_matrix[APV_MAX_NUM_COMP][8][8];
/* indexed as comp * APV_MAX_TILE_COUNT + tile index */
uint8_t tile_qp[APV_MAX_NUM_COMP * APV_MAX_TILE_COUNT];
/* tile boundaries in luma samples, used to find the tile of a block */
uint16_t tile_col[APV_MAX_TILE_COLS + 1];
uint16_t tile_row[APV_MAX_TILE_ROWS + 1];
};
/* Push constants; tile_count, log2_chroma_sub and bit_depth are read by this
 * shader. */
layout (push_constant, scalar) uniform pushConstants {
u8buf tile_data;
ivec2 tile_count;
ivec2 log2_chroma_sub;
int components;
int bit_depth;
};
/* Dequantisation scale, selected by qp % 6 in main(). */
const int apv_level_scale[6] = { 40, 45, 51, 57, 64, 71 };
/*
 * Entry point: dequantise and inverse-transform 8x8 coefficient blocks in
 * place in dst[comp].
 *
 * A workgroup covers APV_BLOCKS_PER_WG horizontally adjacent blocks of one
 * block row of one component (workgroup z); each invocation owns one column
 * (col) of one of those blocks (block).
 *
 * NOTE(review): blocks[], idct8(), idct_scale[] and sign_extend() are not
 * defined in this file - presumably they come from dct.glsl / common.glsl;
 * the comments below describe only how they are used here.
 */
void main(void)
{
const uvec3 wgid = gl_WorkGroupID;
const uint comp = wgid.z;
const uvec3 lid = gl_LocalInvocationID;
const uint block = (lid.y << 2) | (lid.x >> 3); /* 0..7 block in chunk */
const uint col = lid.x & 0x7u; /* 0..7 column in block */
/* one workgroup handles eight horizontally neighbouring blocks */
const int blk_x = int(wgid.x) * APV_BLOCKS_PER_WG + int(block);
const int blk_y = int(wgid.y);
/* top-left sample of the block, in this component's own coordinates */
const ivec2 pos = ivec2(blk_x, blk_y) * APV_TR_SIZE;
/* note: some oddness happens on tile-boundaries */
const ivec2 sub_shift = (comp == 0u) ? ivec2(0) : log2_chroma_sub;
/* the tile tables are in luma coordinates, so scale the position up */
const ivec2 luma_pos = pos << sub_shift;
/* figure out the tile position */
int tx = 0;
while (tx + 1 < tile_count.x && int(tile_col[tx + 1]) <= luma_pos.x)
tx++;
int ty = 0;
while (ty + 1 < tile_count.y && int(tile_row[ty + 1]) <= luma_pos.y)
ty++;
const int tile_idx = ty * tile_count.x + tx;
/* quantiser of this tile/component, split into a table index and a shift */
const int qp = int(tile_qp[int(comp) * APV_MAX_TILE_COUNT + tile_idx]);
const int level_scale = apv_level_scale[qp % 6];
const int qp_shift = qp / 6;
const int half_range = 1 << (bit_depth - 1);
const int max_val = (1 << bit_depth) - 1;
const float fact = float(half_range);
const float norm = 1.0f / (1024.0f * fact); /* DCT normalization const */
[[unroll]]
for (uint y = 0u; y < 8u; y++) {
/* load */
int raw = int(imageLoad(dst[comp], pos + ivec2(col, y)).x);
/* coefficients were stored as the low 16 bits of a signed value */
int coeff = sign_extend(raw, 16);
/* dequant + norm */
int qs = level_scale * int(q_matrix[comp][col][y]) * (1 << qp_shift);
float v = float(coeff * qs) * norm;
/* scale */
/* rows of a block are stored 9 floats apart in blocks[] */
blocks[block][y * 9u + col] = v * idct_scale[y * 8u + col];
}
barrier();
/* first pass: elements col, col + 9, ... i.e. down column col */
idct8(block, col, 9);
barrier();
/* add 1.0 to the first element of row col before the second pass; the
 * result is multiplied by fact (= half_range) below.
 * NOTE(review): presumably this implements the +half_range level shift -
 * confirm against idct8() in dct.glsl */
blocks[block][col * 9u] += 1.0f;
/* second pass: the 8 consecutive elements of row col */
idct8(block, col * 9u, 1);
barrier();
[[unroll]]
for (int y = 0; y < 8; y++) {
/* back to integer sample range, rounded and clamped to [0, max_val] */
float v = round(blocks[block][y * 9u + col] * fact);
imageStore(dst[comp], pos + ivec2(col, y),
uvec4(uint(clamp(int(v), 0, max_val))));
}
}

490
libavcodec/vulkan_apv.c Normal file
View File

@@ -0,0 +1,490 @@
/*
* Copyright (c) 2025 Lynne <dev@lynne.ee>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "vulkan_decode.h"
#include "hwaccel_internal.h"
#include "apv_decode.h"
#include "libavutil/mem.h"
/* Embedded shader binaries and their lengths; handed to ff_vk_shader_link()
 * by init_decode_shader() and init_idct_shader() below. */
extern const unsigned char ff_apv_decode_comp_spv_data[];
extern const unsigned int ff_apv_decode_comp_spv_len;
extern const unsigned char ff_apv_idct_comp_spv_data[];
extern const unsigned int ff_apv_idct_comp_spv_len;
/* Decoder descriptor for APV; it asks for a compute-capable queue. */
const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc = {
.codec_id = AV_CODEC_ID_APV,
.queue_flags = VK_QUEUE_COMPUTE_BIT,
};
/* Per-frame hwaccel state (size given by frame_priv_data_size). */
typedef struct APVVulkanDecodePicture {
/* common Vulkan decode picture state; the code below uses &apvvp->vp */
FFVulkanDecodePicture vp;
/* pooled buffer holding the shader's frame_data_buf contents; acquired in
 * vk_apv_start_frame() and handed to the exec context in vk_apv_end_frame() */
AVBufferRef *frame_data_buf;
/* receives the offsets array from ff_vk_decode_add_slice(); read to obtain
 * the offset of the most recently added tile */
uint32_t *frame_data;
/* number of tile entries recorded so far for this frame */
int tile_num;
} APVVulkanDecodePicture;
/* Codec-specific state stored in FFVulkanDecodeShared.sd_ctx; released by
 * vk_decode_apv_uninit(). */
typedef struct APVVulkanDecodeContext {
/* entropy decoding shader (apv_decode.comp) */
FFVulkanShader decode;
/* dequantisation + inverse transform shader (apv_idct.comp) */
FFVulkanShader idct;
/* pool the per-frame frame_data_buf buffers are taken from */
AVBufferPool *frame_data_pool;
} APVVulkanDecodeContext;
/* Push constants shared by both shaders. The member order and types mirror
 * the pushConstants block declared in apv_decode.comp and apv_idct.comp and
 * must be kept in sync with it. */
typedef struct DecodePushData {
/* device address of the buffer holding the tile bitstreams */
VkDeviceAddress tile_data;
/* number of tile columns, rows */
int tile_count[2];
/* log2 of the horizontal, vertical chroma subsampling */
int log2_chroma_sub[2];
int components;
int bit_depth;
} DecodePushData;
/**
 * Begin decoding a frame.
 *
 * Tries to host-map the packet data, takes a frame data buffer from the pool,
 * fills in everything but the tile offsets (those are written by
 * vk_apv_decode_slice()) and prepares the output frame.
 *
 * @return 0 on success, a negative error code otherwise
 */
static int vk_apv_start_frame(AVCodecContext *avctx,
                              const AVBufferRef *buffer_ref,
                              av_unused const uint8_t *buffer,
                              av_unused uint32_t size)
{
    /* Byte sizes of the consecutive sections of the frame data buffer, in
     * the order the shaders declare them in frame_data_buf. */
    const int offsets_len = 2*4*APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP;
    const int qmat_len    = 64*APV_MAX_NUM_COMP;
    const int qp_len      = APV_MAX_TILE_COUNT*APV_MAX_NUM_COMP;
    const int cols_len    = (APV_MAX_TILE_COLS + 1)*2;
    const int rows_len    = (APV_MAX_TILE_ROWS + 1)*2;

    APVDecodeContext *apv = avctx->priv_data;
    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
    FFVulkanDecodeShared *ctx = dec->shared_ctx;
    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
    FFVulkanDecodePicture *vp = &apvvp->vp;
    int err;

    /* Host-map the input tile data if supported. The result is not checked:
     * vk_apv_decode_slice() copies the data instead when no host-mapped
     * buffer is available. */
    if (ctx->s.extensions & FF_VK_EXT_EXTERNAL_HOST_MEMORY)
        ff_vk_host_map_buffer(&ctx->s, &vp->slices_buf, buffer_ref->data,
                              buffer_ref,
                              VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                              VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT);

    /* Get a buffer large enough for all sections */
    int fd_size = offsets_len + qmat_len + qp_len + cols_len + rows_len;
    err = ff_vk_get_pooled_buffer(&ctx->s, &apvvk->frame_data_pool,
                                  &apvvp->frame_data_buf,
                                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                                  NULL, fd_size,
                                  VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
    if (err < 0)
        return err;

    /* Section start pointers; the tile offsets come first and are skipped */
    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
    uint8_t *base     = frame_data->mapped_mem;
    uint8_t *qmat_dst = base + offsets_len;
    uint8_t *qp_dst   = qmat_dst + qmat_len;
    uint8_t *cols_dst = qp_dst + qp_len;
    uint8_t *rows_dst = cols_dst + cols_len;

    /* Per-component quantisation matrix and per-tile QPs */
    for (int c = 0; c < APV_MAX_NUM_COMP; c++) {
        memcpy(qmat_dst + 64*c,
               apv->cur_raw_frame->frame_header.quantization_matrix.q_matrix[c],
               64);

        for (int t = 0; t < APV_MAX_TILE_COUNT; t++)
            qp_dst[c*APV_MAX_TILE_COUNT + t] =
                apv->cur_raw_frame->tile[t].tile_header.tile_qp[c];
    }

    /* Tile column/row start positions */
    memcpy(cols_dst, apv->tile_info.col_starts, cols_len);
    memcpy(rows_dst, apv->tile_info.row_starts, rows_len);

    /* Prepare frame to be used */
    err = ff_vk_decode_prepare_frame_sdr(dec, apv->output_frame, vp, 1,
                                         FF_VK_REP_NATIVE, 0);
    return err < 0 ? err : 0;
}
/**
 * Record one chunk of tile data for the current frame.
 *
 * An (offset, size) pair is written into the tile offset table at the start
 * of the frame data buffer. With a host-mapped input buffer the offset is the
 * position of data inside that mapping; otherwise the data is appended with
 * ff_vk_decode_add_slice() and the offset it reports is used.
 *
 * @return 0 on success, a negative error code otherwise
 */
static int vk_apv_decode_slice(AVCodecContext *avctx,
                               const uint8_t *data,
                               uint32_t size)
{
    APVDecodeContext *apv = avctx->priv_data;
    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
    FFVulkanDecodePicture *vp = &apvvp->vp;
    FFVkBuffer *frame_data = (FFVkBuffer *)apvvp->frame_data_buf->data;
    FFVkBuffer *slices_buf = NULL;
    uint32_t tile_off;
    int slot;

    if (vp->slices_buf)
        slices_buf = (FFVkBuffer *)vp->slices_buf->data;

    if (slices_buf && slices_buf->host_ref) {
        /* zero-copy path: the data already lives inside the mapped buffer */
        slot     = apvvp->tile_num++;
        tile_off = data - slices_buf->mapped_mem;
    } else {
        /* copy path: the helper appends the data and bumps tile_num */
        int err = ff_vk_decode_add_slice(avctx, vp, data, size, 0,
                                         &apvvp->tile_num,
                                         (const uint32_t **)&apvvp->frame_data);
        if (err < 0)
            return err;

        slot     = apvvp->tile_num - 1;
        tile_off = apvvp->frame_data[slot];
    }

    /* each table entry is two consecutive 32-bit words: offset, then size */
    uint8_t *entry = frame_data->mapped_mem + 2*slot*sizeof(uint32_t);
    AV_WN32(entry, tile_off);
    AV_WN32(entry + sizeof(uint32_t), size);

    return 0;
}
/**
 * Finish a frame: record and submit the GPU work that decodes it.
 *
 * The recorded command buffer
 *  1. clears the output images to zero,
 *  2. runs the entropy decoding shader, one workgroup per tile and component,
 *  3. runs the dequantisation/iDCT shader over all blocks,
 * with image barriers between the steps.
 *
 * @return 0 on success, a negative error code otherwise. Failures of the
 *         intermediate steps are propagated to the caller rather than being
 *         reported as success.
 */
static int vk_apv_end_frame(AVCodecContext *avctx)
{
    int err;
    APVDecodeContext *apv = avctx->priv_data;
    const CodedBitstreamAPVContext *apv_cbc = apv->cbc->priv_data;
    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
    FFVulkanDecodeShared *ctx = dec->shared_ctx;
    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;
    FFVulkanFunctions *vk = &ctx->s.vkfn;
    APVVulkanDecodePicture *apvvp = apv->hwaccel_picture_private;
    FFVulkanDecodePicture *vp = &apvvp->vp;
    FFVkBuffer *slices_buf = (FFVkBuffer *)vp->slices_buf->data;
    FFVkBuffer *frame_data_buf = (FFVkBuffer *)apvvp->frame_data_buf->data;
    AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
    enum AVPixelFormat sw_format = hwfc->sw_format;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(sw_format);
    VkImageMemoryBarrier2 img_bar[8];
    int nb_img_bar = 0;

    FFVkExecContext *exec = ff_vk_exec_get(&ctx->s, &ctx->exec_pool);
    ff_vk_exec_start(&ctx->s, exec);

    /* Make sure the buffer is flushed */
    RET(ff_vk_flush_buffer(&ctx->s, frame_data_buf, 0, frame_data_buf->size, 1));

    /* Prepare deps */
    RET(ff_vk_exec_add_dep_frame(&ctx->s, exec, apv->output_frame,
                                 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));

    err = ff_vk_exec_mirror_sem_value(&ctx->s, exec, &vp->sem, &vp->sem_value,
                                      apv->output_frame);
    if (err < 0)
        return err;

    /* The buffers are handed to the exec context; the picture's own pointers
     * are cleared so that they are not released a second time. */
    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &vp->slices_buf, 1, 0));
    vp->slices_buf = NULL;
    RET(ff_vk_exec_add_dep_buf(&ctx->s, exec, &apvvp->frame_data_buf, 1, 0));
    apvvp->frame_data_buf = NULL;

    /* Previous image contents are not needed: the frame is cleared below */
    AVVkFrame *vkf = (AVVkFrame *)apv->output_frame->data[0];
    vkf->layout[0] = VK_IMAGE_LAYOUT_UNDEFINED;
    vkf->access[0] = VK_ACCESS_2_NONE;

    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame,
                        img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
                        VK_ACCESS_2_TRANSFER_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
    });
    nb_img_bar = 0;

    /* Zero frame: the decode shader only writes non-zero coefficients */
    for (int i = 0; i < ff_vk_count_images(vkf); i++)
        vk->CmdClearColorImage(exec->buf, vkf->img[i],
                               VK_IMAGE_LAYOUT_GENERAL,
                               &((VkClearColorValue) { 0 }),
                               1, &((VkImageSubresourceRange) {
                                   .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
                                   .levelCount = 1,
                                   .layerCount = 1,
                               }));

    /* Wait for the frame to get zeroed out before continuing */
    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_CLEAR_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
    });
    nb_img_bar = 0;

    /* Setup push data, shared by both shaders */
    DecodePushData pd = (DecodePushData) {
        .tile_data = slices_buf->address,
        .tile_count = { apv->tile_info.tile_cols, apv->tile_info.tile_rows },
        .log2_chroma_sub = { desc->log2_chroma_w, desc->log2_chroma_h },
        .components = desc->nb_components,
        .bit_depth = apv_cbc->bit_depth,
    };

    /* Decoding */
    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->decode,
                                  apv->output_frame, vp->view.out,
                                  0, 0,
                                  VK_IMAGE_LAYOUT_GENERAL,
                                  VK_NULL_HANDLE);
    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->decode,
                                    0, 1, 0,
                                    frame_data_buf,
                                    0, frame_data_buf->size,
                                    VK_FORMAT_UNDEFINED);

    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->decode);
    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->decode,
                                   VK_SHADER_STAGE_COMPUTE_BIT,
                                   0, sizeof(pd), &pd);

    /* one workgroup per tile and component */
    vk->CmdDispatch(exec->buf,
                    apv->tile_info.tile_cols, apv->tile_info.tile_rows,
                    desc->nb_components);

    /* Wait for all decoding to finish */
    ff_vk_frame_barrier(&ctx->s, exec, apv->output_frame, img_bar, &nb_img_bar,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                        VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
    });
    nb_img_bar = 0;

    /* iDCT */
    ff_vk_shader_update_img_array(&ctx->s, exec, &apvvk->idct,
                                  apv->output_frame, vp->view.out,
                                  0, 0,
                                  VK_IMAGE_LAYOUT_GENERAL,
                                  VK_NULL_HANDLE);
    ff_vk_shader_update_desc_buffer(&ctx->s, exec, &apvvk->idct,
                                    0, 1, 0,
                                    frame_data_buf,
                                    0, frame_data_buf->size,
                                    VK_FORMAT_UNDEFINED);

    ff_vk_exec_bind_shader(&ctx->s, exec, &apvvk->idct);
    ff_vk_shader_update_push_const(&ctx->s, exec, &apvvk->idct,
                                   VK_SHADER_STAGE_COMPUTE_BIT,
                                   0, sizeof(pd), &pd);

    /* one workgroup per group of 8 horizontally adjacent transform blocks,
     * in the luma basis coords, in case a block is OOB writes/reads are ignored */
    int idct_cx = 0, idct_by = 0;
    for (int comp = 0; comp < desc->nb_components; comp++) {
        int sw = (comp == 0) ? 0 : desc->log2_chroma_w;
        int sh = (comp == 0) ? 0 : desc->log2_chroma_h;
        /* number of 8x8 blocks of this component, rounded up */
        int bx = (avctx->coded_width + (1 << (3 + sw)) - 1) >> (3 + sw);
        int by = (avctx->coded_height + (1 << (3 + sh)) - 1) >> (3 + sh);
        idct_cx = FFMAX(idct_cx, (bx + 7) >> 3);
        idct_by = FFMAX(idct_by, by);
    }

    vk->CmdDispatch(exec->buf, idct_cx, idct_by, desc->nb_components);

    err = ff_vk_exec_submit(&ctx->s, exec);
    if (err < 0)
        return err;

    return 0;

fail:
    /* Reached through RET() only, so err holds the negative error code of the
     * step that failed; returning 0 here would hide the failure.
     * NOTE(review): the started exec context is left as-is on this path -
     * confirm whether its dependencies should be discarded here. */
    return err;
}
/**
 * Set up the entropy decoding shader: 1x1x1 workgroups, the shared push
 * constant block and one descriptor set holding the output images and the
 * frame data buffer.
 *
 * @return a negative error code if linking or registering the shader fails,
 *         otherwise the result of the registration
 */
static int init_decode_shader(AVCodecContext *avctx, FFVulkanContext *s,
                              FFVkExecPool *pool, FFVulkanShader *shd)
{
    int err;
    AVHWFramesContext *hwfc = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
    /* one storage image per plane of the software format */
    const int nb_planes = av_pix_fmt_count_planes(hwfc->sw_format);

    const FFVulkanDescriptorSetBinding bindings[] = {
        {
            .name   = "dst",
            .type   = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
            .elems  = nb_planes,
            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        },
        {
            .name   = "frame_data_buf",
            .type   = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        },
    };

    ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, NULL,
                      (uint32_t []) { 1, 1, 1 }, 0);

    ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
                                VK_SHADER_STAGE_COMPUTE_BIT);

    ff_vk_shader_add_descriptor_set(s, shd, bindings, 2, 0, 0);

    RET(ff_vk_shader_link(s, shd,
                          ff_apv_decode_comp_spv_data,
                          ff_apv_decode_comp_spv_len, "main"));

    RET(ff_vk_shader_register_exec(s, pool, shd));

fail:
    return err;
}
/*
 * Set up the dequantisation/iDCT shader: specialization values (the number of
 * blocks per workgroup and a 64-entry table of products of the 8 scale
 * factors below), 32x2x1 workgroups, the shared push constant block and one
 * descriptor set holding the images and the frame data buffer.
 *
 * Returns a negative error code if linking or registering the shader fails,
 * otherwise the result of the registration.
 *
 * NOTE(review): the SPEC_LIST_* macros are defined elsewhere; their second
 * and third arguments are presumably the specialization constant ID and the
 * value width in bits - confirm against their definition.
 */
static int init_idct_shader(AVCodecContext *avctx, FFVulkanContext *s,
FFVkExecPool *pool, FFVulkanShader *shd)
{
int err;
AVHWFramesContext *dec_frames_ctx;
dec_frames_ctx = (AVHWFramesContext *)avctx->hw_frames_ctx->data;
/* room for 1 + 64 entries of 32 bits each */
SPEC_LIST_CREATE(sl, 1 + 64, (1 + 64)*sizeof(uint32_t))
SPEC_LIST_ADD(sl, 16, 32, 8); /* nb_blocks per workgroup */
/* per-frequency scale factors: cos(k*pi/16) / 2, with cos(4*pi/16) / 2
 * used for k = 0 */
const double idct_8_scales[8] = {
cos(4.0*M_PI/16.0) / 2.0, cos(1.0*M_PI/16.0) / 2.0,
cos(2.0*M_PI/16.0) / 2.0, cos(3.0*M_PI/16.0) / 2.0,
cos(4.0*M_PI/16.0) / 2.0, cos(5.0*M_PI/16.0) / 2.0,
cos(6.0*M_PI/16.0) / 2.0, cos(7.0*M_PI/16.0) / 2.0,
};
/* entry i is the product of the factors for i >> 3 and i & 7, passed as
 * the bit pattern of a float */
for (int i = 0; i < 64; i++)
SPEC_LIST_ADD(sl, 18 + i, 32,
av_float2int(idct_8_scales[i >> 3]*idct_8_scales[i & 7]));
/* 32x2 invocations: the shader splits them into 8 blocks of 8 columns */
ff_vk_shader_load(shd, VK_SHADER_STAGE_COMPUTE_BIT, sl,
(uint32_t []) { 32, 2, 1 }, 0);
ff_vk_shader_add_push_const(shd, 0, sizeof(DecodePushData),
VK_SHADER_STAGE_COMPUTE_BIT);
FFVulkanDescriptorSetBinding desc_set[] = {
{
.name = "dst",
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
/* one image per plane of the software format */
.elems = av_pix_fmt_count_planes(dec_frames_ctx->sw_format),
},
{
.name = "frame_data_buf",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
ff_vk_shader_add_descriptor_set(s, shd, desc_set, 2, 0, 0);
RET(ff_vk_shader_link(s, shd,
ff_apv_idct_comp_spv_data,
ff_apv_idct_comp_spv_len, "main"));
RET(ff_vk_shader_register_exec(s, pool, shd));
fail:
return err;
}
/**
 * Release the APV-specific decoder state: both shaders, the frame data pool
 * and the context itself. Installed as ctx->sd_ctx_free by
 * vk_decode_apv_init().
 */
static void vk_decode_apv_uninit(FFVulkanDecodeShared *ctx)
{
    APVVulkanDecodeContext *apvvk = ctx->sd_ctx;

    ff_vk_shader_free(&ctx->s, &apvvk->decode);
    ff_vk_shader_free(&ctx->s, &apvvk->idct);

    av_buffer_pool_uninit(&apvvk->frame_data_pool);

    /* Free through the owning pointer so that it is reset to NULL; freeing
     * via the local copy would leave ctx->sd_ctx dangling. */
    av_freep(&ctx->sd_ctx);
}
/**
 * Hwaccel init callback: run the common Vulkan decoder initialisation, then
 * allocate the APV-specific context and build both compute shaders.
 *
 * @return a negative error code on failure
 */
static int vk_decode_apv_init(AVCodecContext *avctx)
{
    FFVulkanDecodeContext *dec = avctx->internal->hwaccel_priv_data;
    FFVulkanDecodeShared *ctx;
    APVVulkanDecodeContext *apvvk;
    int err;

    err = ff_vk_decode_init(avctx);
    if (err < 0)
        return err;

    ctx = dec->shared_ctx;

    apvvk = av_mallocz(sizeof(*apvvk));
    ctx->sd_ctx = apvvk;
    if (!apvvk)
        return AVERROR(ENOMEM);

    /* from here on the shared context releases apvvk through this callback */
    ctx->sd_ctx_free = &vk_decode_apv_uninit;

    RET(init_decode_shader(avctx, &ctx->s, &ctx->exec_pool,
                           &apvvk->decode));

    RET(init_idct_shader(avctx, &ctx->s, &ctx->exec_pool,
                         &apvvk->idct));

fail:
    return err;
}
/**
 * Release the per-frame hwaccel data: the common Vulkan picture state first,
 * then the frame data buffer reference, if one is still held.
 */
static void vk_apv_free_frame_priv(AVRefStructOpaque _hwctx, void *data)
{
    APVVulkanDecodePicture *pic = data;
    AVHWDeviceContext *device = _hwctx.nc;

    ff_vk_decode_free_frame(device, &pic->vp);

    av_buffer_unref(&pic->frame_data_buf);
}
/* Hwaccel descriptor for APV decoding into AV_PIX_FMT_VULKAN frames. The
 * per-frame callbacks and init are implemented in this file; uninit, frame
 * parameters and thread context updates use the shared ff_vk_* helpers. */
const FFHWAccel ff_apv_vulkan_hwaccel = {
.p.name = "apv_vulkan",
.p.type = AVMEDIA_TYPE_VIDEO,
.p.id = AV_CODEC_ID_APV,
.p.pix_fmt = AV_PIX_FMT_VULKAN,
.start_frame = &vk_apv_start_frame,
.decode_slice = &vk_apv_decode_slice,
.end_frame = &vk_apv_end_frame,
.free_frame_priv = &vk_apv_free_frame_priv,
/* per-frame private data is an APVVulkanDecodePicture */
.frame_priv_data_size = sizeof(APVVulkanDecodePicture),
.init = &vk_decode_apv_init,
.update_thread_context = &ff_vk_update_thread_context,
.uninit = &ff_vk_decode_uninit,
.frame_params = &ff_vk_frame_params,
.priv_data_size = sizeof(FFVulkanDecodeContext),
.caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_THREAD_SAFE,
};

View File

@@ -28,6 +28,7 @@
/* Evaluates to non-zero when codec_id is FFV1, DPX, APV, ProRes RAW or ProRes.
 * NOTE(review): presumably these are the codecs decoded with compute shaders
 * rather than through Vulkan video - confirm against the users of this macro. */
#define DECODER_IS_SDR(codec_id) \
(((codec_id) == AV_CODEC_ID_FFV1) || \
((codec_id) == AV_CODEC_ID_DPX) || \
((codec_id) == AV_CODEC_ID_APV) || \
((codec_id) == AV_CODEC_ID_PRORES_RAW) || \
((codec_id) == AV_CODEC_ID_PRORES))
@@ -55,6 +56,9 @@ extern const FFVulkanDecodeDescriptor ff_vk_dec_prores_desc;
#if CONFIG_DPX_VULKAN_HWACCEL
extern const FFVulkanDecodeDescriptor ff_vk_dec_dpx_desc;
#endif
#if CONFIG_APV_VULKAN_HWACCEL
extern const FFVulkanDecodeDescriptor ff_vk_dec_apv_desc;
#endif
static const FFVulkanDecodeDescriptor *dec_descs[] = {
#if CONFIG_H264_VULKAN_HWACCEL
@@ -81,6 +85,9 @@ static const FFVulkanDecodeDescriptor *dec_descs[] = {
#if CONFIG_DPX_VULKAN_HWACCEL
&ff_vk_dec_dpx_desc,
#endif
#if CONFIG_APV_VULKAN_HWACCEL
&ff_vk_dec_apv_desc,
#endif
};
typedef struct FFVulkanDecodeProfileData {