Files
FFmpeg/libswscale/x86/ops_float.asm
Niklas Haas 43a8e2da01 swscale/x86/ops: rewrite based on uops_macros.h
This is a ground-up refactor of the existing x86 ops code, using the new
uops macros to auto-generate every single kernel instance without guesswork.

While I was at it, I also cleaned up the file a bit and made sure we have only
a single, consistent way of writing/defining the kernels. This also gets rid
of some of the old boilerplate like decl_pattern.

Most kernels are trivial ports, but a few deserve attention or note:

- SWS_UOP_LINEAR is now generated more efficiently, thanks to the distinction
  between 0/1/arbitrary components. I also rewrote the code to keep track of
  whether the output was initialized yet or not, which lets us skip the
  initial `xorps` and `addps` for the first component.

- SWS_UOP_PERMUTE is generated automatically by using some NASM logic to
  detect permutation cycles and emit the minimal sequence of `mova`
  instructions. SWS_UOP_COPY, on the other hand, is implemented naively. I
  originally had a more complex implementation that could handle both, but
  I decided it really isn't worth the complication just to save 2-3 cycles.

- SWS_UOP_SCALE now has a native 8-bit implementation, which is faster than
  falling back to C code.

- SWS_UOP_SWAP_BYTES is no longer compiled as a type-agnostic pshufb; instead,
  we hard-code the shuffle mask.

- SWS_UOP_DITHER is now much simpler and avoids branching etc. entirely

Signed-off-by: Niklas Haas <git@haasn.dev>
2026-06-09 18:27:20 +02:00

701 lines
20 KiB
NASM

;******************************************************************************
;* Copyright (c) 2025-2026 Niklas Haas
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "ops_include.asm"
; Scale factor of the integer filter weights (2^14). The horizontal kernels
; below multiply their sums by scale_inv to remove it again.
%assign SWS_FILTER_SCALE (1 << 14)
; BLOCK_SIZE and BYTES are not defined in this file (presumably they come from
; ops_include.asm and the kernel declaration macros).
%define FILTER_WIDTH (BLOCK_SIZE / 4) ; always outputs F32
%define FILTER_SIZE (FILTER_WIDTH * BYTES) ; in source bytes
SECTION_RODATA
align 32
bias16: times 16 dw 0x8000 ; shift unsigned to signed range
align 16
; Added to the 32-bit sums of the U16 kernels, whose pixels were offset by
; bias16 before the multiply-accumulate.
bias32: times 4 dd 0x8000 * SWS_FILTER_SCALE
scale_inv: times 4 dd 0x38800000 ; 1.0f / SWS_FILTER_SCALE
SECTION .text
;---------------------------------------------------------
; Generic vertical filtering (add+mul and fma)
; Load unsigned 8-bit pixels from %2, zero-extend each one to 32 bits and
; convert the result to single-precision floats in %1.
%macro FLOAD_SWS_PIXEL_U8 2 ; dst, src
pmovzxbd %1, %2
vcvtdq2ps %1, %1
%endmacro
; Load unsigned 16-bit pixels from %2, zero-extend each one to 32 bits and
; convert the result to single-precision floats in %1.
%macro FLOAD_SWS_PIXEL_U16 2 ; dst, src
pmovzxwd %1, %2
vcvtdq2ps %1, %1
%endmacro
; Placeholder for 32-bit unsigned input: expanding this macro aborts assembly
; with the %error below, so no U32 vertical filter can be instantiated.
%macro FLOAD_SWS_PIXEL_U32 2 ; dst, src
%error TODO, no clean way to convert U32 to F32 on AVX2
%endmacro
; Float input needs no conversion: plain unaligned load of %2 into %1.
%macro FLOAD_SWS_PIXEL_F32 2 ; dst, src
movu %1, %2
%endmacro
; sum += src * weight, using a separate multiply and add.
; Clobbers: src (%2) is overwritten with the product.
%macro ACCUM_MULADD 3 ; sum, src, weight
mulps %2, %3
addps %1, %2
%endmacro
; sum = src * weight + sum as a single fused multiply-add (x86inc 4-operand
; fmaddps form). Unlike ACCUM_MULADD, src (%2) is left untouched.
%macro ACCUM_FMA 3 ; sum, src, weight
fmaddps %1, %2, %3, %1
%endmacro
; Apply one vertical filter tap to every active plane.
;   %1 (accum_fn): instruction or macro invoked as `accum_fn sum, src, weight`
; Broadcasts the scalar at [weights] into m12, loads the first half of each
; plane's block into m8..m11 and accumulates into mx..mw, then repeats for the
; second half (FILTER_SIZE / 2 source bytes further on) into mx2..mw2.
; `weights` and FLOAD_PX must be defined by the enclosing macro.
; Clobbers: m8-m12.
%macro filter_v_iter 1 ; accum_fn
vbroadcastss m12, [weights]
; first half of the block
IF X, FLOAD_PX m8, [in0q]
IF Y, FLOAD_PX m9, [in1q]
IF Z, FLOAD_PX m10, [in2q]
IF W, FLOAD_PX m11, [in3q]
IF X, %1 mx, m8, m12
IF Y, %1 my, m9, m12
IF Z, %1 mz, m10, m12
IF W, %1 mw, m11, m12
; second half of the block
IF X, FLOAD_PX m8, [in0q + (FILTER_SIZE >> 1)]
IF Y, FLOAD_PX m9, [in1q + (FILTER_SIZE >> 1)]
IF Z, FLOAD_PX m10, [in2q + (FILTER_SIZE >> 1)]
IF W, FLOAD_PX m11, [in3q + (FILTER_SIZE >> 1)]
IF X, %1 mx2, m8, m12
IF Y, %1 my2, m9, m12
IF Z, %1 mz2, m10, m12
IF W, %1 mw2, m11, m12
%endmacro
; Body of the vertical filter kernels, shared by the mul+add and FMA variants.
;   %1 (accum_fn): accumulation macro used for every tap after the first
; Reads the weight pointer and filter size from SwsOpImpl.priv, selects the
; weights for output row `y`, and sums filter_size input rows (stepping each
; plane by its in_stride) into mx..mw / mx2..mw2. The input pointers are
; restored afterwards and advanced by one block (FILTER_SIZE bytes).
; Clobbers: tmp0q, tmp1q, tmp2q, m8-m12, flags.
%macro read_planar_fv 1 ; accum_fn
%xdefine FLOAD_PX FLOAD_ %+ TYPE
%xdefine weights tmp0q
%xdefine fltsize tmp1q
assert_idn FILTER_TYPE, SWS_PIXEL_F32
%assign SIZEOF_WEIGHT 4
mov weights, [implq + SwsOpImpl.priv] ; float *weights
mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size
; weights += filter_size * y * sizeof(float)
mov tmp2q, fltsize
imul tmp2q, yq
lea weights, [weights + SIZEOF_WEIGHT * tmp2q]
; first tap: a plain mulps initializes the accumulators, so they never need
; to be zeroed beforehand
filter_v_iter mulps
dec fltsize
jz .done
; the loop below walks the input pointers down the rows; save them so they
; can be restored once all taps are consumed
IF X, push in0q
IF Y, push in1q
IF Z, push in2q
IF W, push in3q
.loop:
IF X, add in0q, [execq + SwsOpExec.in_stride0]
IF Y, add in1q, [execq + SwsOpExec.in_stride1]
IF Z, add in2q, [execq + SwsOpExec.in_stride2]
IF W, add in3q, [execq + SwsOpExec.in_stride3]
add weights, SIZEOF_WEIGHT
filter_v_iter %1
dec fltsize
jnz .loop
; restore in reverse push order
IF W, pop in3q
IF Z, pop in2q
IF Y, pop in1q
IF X, pop in0q
.done:
; `weights` is dead here, so tmp0q is reused for the continuation
LOAD_CONT tmp0q
; advance the inputs to the next block
IF W, add in3q, FILTER_SIZE
IF Z, add in2q, FILTER_SIZE
IF Y, add in1q, FILTER_SIZE
IF X, add in0q, FILTER_SIZE
CONTINUE tmp0q
%undef weights
%undef fltsize
%endmacro
; Vertical filter kernel using separate multiply and add (ACCUM_MULADD).
;   %1: filter (output) pixel type; read_planar_fv asserts it is SWS_PIXEL_F32
%macro READ_PLANAR_FV 1
%xdefine FILTER_TYPE %1
read_planar_fv ACCUM_MULADD
%endmacro
; Vertical filter kernel using fused multiply-add (ACCUM_FMA).
;   %1: filter (output) pixel type; read_planar_fv asserts it is SWS_PIXEL_F32
%macro READ_PLANAR_FV_FMA 1
%xdefine FILTER_TYPE %1
read_planar_fv ACCUM_FMA
%endmacro
;---------------------------------------------------------
; Generic horizontal filtering (vpgatherdd)
; One horizontal filter step for U8 input.
;   %1, %2: 32-bit integer accumulators for the two halves of the block
;   %3:     plane base pointer
;   %4:     1 to initialize the accumulators, 0 to add to them
; Gathers one dword (four consecutive source bytes) per output pixel using
; the offsets in m14/m15, widens the bytes to words and multiplies them with
; four vectors of 16-bit weights starting at [weights].
; Clobbers: m8-m13.
%macro FILTER_H_SWS_PIXEL_U8 4 ; acc, acc2, src, first
    ; vpgatherdd consumes (zeroes) its mask operand, so the all-ones masks
    ; have to be rebuilt on every invocation
    pcmpeqb     m12, m12
    pcmpeqb     m13, m13
    vpgatherdd  m8, [%3 + m14], m12         ; { ABCD | EFGH }, 4 source bytes per dword
    vpgatherdd  m9, [%3 + m15], m13         ; { IJKL | MNOP }
    ; m12 is all-zero again after the gather; interleaving with it
    ; zero-extends every byte to a 16-bit word
    punpckhbw   m10, m8, m12                ; { CCDD | GGHH }
    punpcklbw   m8, m8, m12                 ; { AABB | EEFF }
    punpckhbw   m11, m9, m12                ; { KKLL | OOPP }
    punpcklbw   m9, m9, m12                 ; { IIJJ | MMNN }
    pmaddwd     m8, [weights]
    pmaddwd     m10, [weights + mmsize]
    pmaddwd     m9, [weights + mmsize * 2]
    pmaddwd     m11, [weights + mmsize * 3]
%if (%4) == 0
    ; later steps: reduce into temporaries, then add to the running sums
    phaddd      m8, m10
    phaddd      m9, m11
    paddd       %1, m8
    paddd       %2, m9
%else
    ; first step: the pairwise reduction writes the accumulators directly
    phaddd      %1, m8, m10                 ; { ABCD | EFGH }
    phaddd      %2, m9, m11                 ; { IJKL | MNOP }
%endif
%endmacro
; One horizontal filter step for U16 input.
;   %1, %2: 32-bit integer accumulators for the two halves of the block
;   %3:     plane base pointer
;   %4:     1 to initialize the accumulators, 0 to add to them
; Gathers one dword (two consecutive source pixels) per output pixel using
; the offsets in m14/m15. Expects m10 to hold the bias16 constant, which
; moves the unsigned pixels into signed range for pmaddwd.
; Clobbers: m8, m9, m12, m13.
%macro FILTER_H_SWS_PIXEL_U16 4 ; acc, acc2, src, first
    ; gather masks are consumed by vpgatherdd, so rebuild them each time
    pcmpeqb     m12, m12
    pcmpeqb     m13, m13
    vpgatherdd  m8, [%3 + m14], m12
    vpgatherdd  m9, [%3 + m15], m13
    psubw       m8, m10
    psubw       m9, m10
%if (%4) == 0
    ; later steps: multiply in place, then add to the running sums
    pmaddwd     m8, [weights]
    pmaddwd     m9, [weights + mmsize]
    paddd       %1, m8
    paddd       %2, m9
%else
    ; first step: products initialize the accumulators directly
    pmaddwd     %1, m8, [weights]
    pmaddwd     %2, m9, [weights + mmsize]
%endif
%endmacro
; One horizontal filter step for F32 input.
;   %1, %2: float accumulators for the two halves of the block
;   %3:     plane base pointer
;   %4:     1 to initialize the accumulators, 0 to add to them
; Gathers one float per output pixel using the offsets in m14/m15 and
; multiplies it with the float weights at [weights].
; Clobbers: m8, m9, m12, m13.
%macro FILTER_H_SWS_PIXEL_F32 4 ; acc, acc2, src, first
    ; gather masks are consumed by vpgatherdd, so rebuild them each time
    pcmpeqb     m12, m12
    pcmpeqb     m13, m13
    vpgatherdd  m8, [%3 + m14], m12
    vpgatherdd  m9, [%3 + m15], m13
%if (%4) == 0
    ; later steps: multiply in place, then add to the running sums
    mulps       m8, [weights]
    mulps       m9, [weights + mmsize]
    addps       %1, m8
    addps       %2, m9
%else
    ; first step: products initialize the accumulators directly
    mulps       %1, m8, [weights]
    mulps       %2, m9, [weights + mmsize]
%endif
%endmacro
; Generic horizontal filter kernel based on vpgatherdd.
;   %1: filter (output) pixel type; asserted to be SWS_PIXEL_F32
; For the block starting at x = bx * FILTER_WIDTH, loads two vectors of 32-bit
; source offsets from exec->in_offset_x, then repeatedly gathers one dword per
; output pixel and multiply-accumulates it with the weights via FILTER_H_PX.
; Each step consumes 4 source bytes (4 / BYTES taps) per pixel. Integer sums
; are converted to float, and the result is multiplied by scale_inv.
; Clobbers: tmp0q, tmp1q, tmp2q, m8-m15, flags.
%macro READ_PLANAR_FH 1
%xdefine FILTER_TYPE %1
%xdefine weights tmp0q
%xdefine fltsize tmp1q
%xdefine FILTER_H_PX FILTER_H_ %+ TYPE
assert_idn FILTER_TYPE, SWS_PIXEL_F32
%ifidn TYPE, SWS_PIXEL_F32
%assign SIZEOF_WEIGHT 4 ; F32
%else
%assign SIZEOF_WEIGHT 2 ; I16
%endif
; tmp0q temporarily holds the offset table; it becomes `weights` further down
mov tmp0q, [execq + SwsOpExec.in_offset_x]
mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size
mov tmp2d, bxd
shl_log2 tmp2q, FILTER_WIDTH ; x := bx * FILTER_WIDTH
movu m14, [tmp0q + 4 * tmp2q] ; &exec->in_offset_x[x]
movu m15, [tmp0q + 4 * tmp2q + mmsize]
mov weights, [implq + SwsOpImpl.priv]
%ifidn TYPE, SWS_PIXEL_U16
; constants for the unsigned -> signed trick: m10 is subtracted from the
; pixels inside FILTER_H_PX, m11 is added back to the sums after the loop
VBROADCASTI128 m10, [bias16]
VBROADCASTI128 m11, [bias32]
%endif
imul tmp2q, fltsize
lea weights, [weights + tmp2q * SIZEOF_WEIGHT] ; weights += x * filter_size
; first step initializes the accumulators (last argument = 1)
IF X, FILTER_H_PX mx, mx2, in0q, 1
IF Y, FILTER_H_PX my, my2, in1q, 1
IF Z, FILTER_H_PX mz, mz2, in2q, 1
IF W, FILTER_H_PX mw, mw2, in3q, 1
sub fltsize, 4 / BYTES
jz .done
; the loop advances the plane pointers; save them for restoring afterwards
IF X, push in0q
IF Y, push in1q
IF Z, push in2q
IF W, push in3q
.loop:
; move on to the next gathered dword of every pixel
IF X, add in0q, 4
IF Y, add in1q, 4
IF Z, add in2q, 4
IF W, add in3q, 4
add weights, BLOCK_WIDTH * SIZEOF_WEIGHT
IF X, FILTER_H_PX mx, mx2, in0q, 0
IF Y, FILTER_H_PX my, my2, in1q, 0
IF Z, FILTER_H_PX mz, mz2, in2q, 0
IF W, FILTER_H_PX mw, mw2, in3q, 0
sub fltsize, 4 / BYTES
jnz .loop
; restore in reverse push order
IF W, pop in3q
IF Z, pop in2q
IF Y, pop in1q
IF X, pop in0q
.done:
%ifidn TYPE, SWS_PIXEL_U16
; undo the bias16 offset that was applied to every pixel
IF X, paddd mx, m11
IF Y, paddd my, m11
IF Z, paddd mz, m11
IF W, paddd mw, m11
IF X, paddd mx2, m11
IF Y, paddd my2, m11
IF Z, paddd mz2, m11
IF W, paddd mw2, m11
%endif
%ifnidn TYPE, SWS_PIXEL_F32
; integer accumulators -> float
IF X, vcvtdq2ps mx, mx
IF Y, vcvtdq2ps my, my
IF Z, vcvtdq2ps mz, mz
IF W, vcvtdq2ps mw, mw
IF X, vcvtdq2ps mx2, mx2
IF Y, vcvtdq2ps my2, my2
IF Z, vcvtdq2ps mz2, mz2
IF W, vcvtdq2ps mw2, mw2
%endif
; NOTE(review): the 1 / SWS_FILTER_SCALE factor is applied for every TYPE,
; including F32 -- confirm the F32 weights are pre-scaled accordingly
VBROADCASTI128 m12, [scale_inv]
LOAD_CONT tmp0q
IF X, mulps mx, m12
IF Y, mulps my, m12
IF Z, mulps mz, m12
IF W, mulps mw, m12
IF X, mulps mx2, m12
IF Y, mulps my2, m12
IF Z, mulps mz2, m12
IF W, mulps mw2, m12
CONTINUE tmp0q
%undef weights
%undef fltsize
%endmacro
;---------------------------------------------------------
; 4x4 transposed horizontal filtering
; Integer load for U8 input: zero-extend bytes from %2 to 16-bit words in %1.
%macro ILOAD_SWS_PIXEL_U8 2 ; dst, src
pmovzxbw %1, %2
%endmacro
; Integer load for U16 input: unaligned load of %2 into %1, then subtract
; bias16 (0x8000 per word). The callers add bias32 back to the final sums.
%macro ILOAD_SWS_PIXEL_U16 2 ; dst, src
movu %1, %2
psubw %1, [bias16] ; shift into signed I16 range
%endmacro
; Float input needs no preparation: plain unaligned load of %2 into %1.
%macro ILOAD_SWS_PIXEL_F32 2 ; dst, src
movu %1, %2
%endmacro
; filter 4 adjacent pixels at the same time
;   %1 (dst): XMM register receiving the 4 filtered pixels
;   %2 (src): plane base pointer; saved and restored around the tap loop
; Each of the 4 output pixels reads its source data at %2 + offsetNq. Per
; iteration one XMM register of source data per pixel is multiplied with 16
; bytes of weights (64 bytes of weights in total); `fltsize` is the number of
; weight bytes to consume and must be a multiple of 64. The four partial sum
; vectors are then transposed and added so that each lane of dst holds one
; pixel's complete sum.
; Clobbers: bxq, xm8-xm11, xm14, xm15, flags.
%macro filter_h4_4x4 2 ; dst, src
; pick the arithmetic matching the weight type
%ifidn TYPE, SWS_PIXEL_F32
%xdefine MUL mulps
%xdefine ADD addps
%else
%xdefine MUL pmaddwd
%xdefine ADD paddd
%endif
ILOAD_PX xm8, [%2 + offset0q] ; {a0, a1, a2, a3}
ILOAD_PX xm9, [%2 + offset1q] ; {b0, b1, b2, b3}
ILOAD_PX xm10, [%2 + offset2q] ; {c0, c1, c2, c3}
ILOAD_PX xm11, [%2 + offset3q] ; {d0, d1, d2, d3}
MUL xm8, [weights]
MUL xm9, [weights + 16]
MUL xm10, [weights + 32]
MUL xm11, [weights + 48]
; bxq counts the remaining weight bytes
mov bxq, fltsize
sub bxq, 64
jz %%done
; the loop advances both pointers; they must be unchanged on exit because
; the same weights/offsets are reused for the other planes
push weights
push %2
%%loop:
add weights, 64
add %2, (16 / SIZEOF_WEIGHT) * BYTES ; pixels per xmm reg
ILOAD_PX xm14, [%2 + offset0q]
ILOAD_PX xm15, [%2 + offset1q]
MUL xm14, [weights]
MUL xm15, [weights + 16]
ADD xm8, xm14
ADD xm9, xm15
ILOAD_PX xm14, [%2 + offset2q]
ILOAD_PX xm15, [%2 + offset3q]
MUL xm14, [weights + 32]
MUL xm15, [weights + 48]
ADD xm10, xm14
ADD xm11, xm15
sub bxq, 64
jnz %%loop
pop %2
pop weights
%%done:
; 4x4 transpose (on XMM size)
punpckhdq xm15, xm8, xm9 ; {a2, b2, a3, b3}
punpckldq xm8, xm9 ; {a0, b0, a1, b1}
punpckhdq xm9, xm10, xm11 ; {c2, d2, c3, d3}
punpckldq xm10, xm11 ; {c0, d0, c1, d1}
punpckhqdq xm11, xm8, xm10 ; {a1, b1, c1, d1}
punpcklqdq xm8, xm10 ; {a0, b0, c0, d0}
punpckhqdq xm10, xm15, xm9 ; {a3, b3, c3, d3}
punpcklqdq xm15, xm9 ; {a2, b2, c2, d2}
ADD xm8, xm11 ; sum all even terms
ADD xm15, xm10 ; sum all odd terms
ADD %1, xm8, xm15
%undef MUL
%undef ADD
%endmacro
; filter low and high lanes separately and combine results for each plane
;   %1 (offsets):    address expression of 8 consecutive int32 source offsets
;   %2 (dst_suffix): optional suffix selecting the destination register set
;                    (empty -> mx..mw, 2 -> mx2..mw2)
; Filters pixels 0-3 straight into the low XMM half of each destination,
; advances `weights` by `fltsize`, filters pixels 4-7 into xm12/xm13 and
; inserts them as the upper 128-bit lane. Integer sums are finally converted
; to float (after undoing the U16 bias where applicable).
; Clobbers: offset0q-offset3q, weights, bxq, xm8-xm15, flags.
%macro filter_h8_4x4 1-2 ; offsets, dst_suffix
movsxd offset0q, dword [%1 + 0]
movsxd offset1q, dword [%1 + 4]
movsxd offset2q, dword [%1 + 8]
movsxd offset3q, dword [%1 + 12]
IF X, filter_h4_4x4 xmx%2, in0q
IF Y, filter_h4_4x4 xmy%2, in1q
IF Z, filter_h4_4x4 xmz%2, in2q
IF W, filter_h4_4x4 xmw%2, in3q
; step to the weights of the next group of 4 pixels
add weights, fltsize
movsxd offset0q, dword [%1 + 16]
movsxd offset1q, dword [%1 + 20]
movsxd offset2q, dword [%1 + 24]
movsxd offset3q, dword [%1 + 28]
; xm12/xm13 are not touched by filter_h4_4x4, so two results can be kept
; alive before they are inserted into the upper lanes
IF X, filter_h4_4x4 xm12, in0q
IF Y, filter_h4_4x4 xm13, in1q
IF X, vinsertf128 mx%2, mx%2, xmm12, 1
IF Y, vinsertf128 my%2, my%2, xmm13, 1
IF Z, filter_h4_4x4 xm12, in2q
IF W, filter_h4_4x4 xm13, in3q
IF Z, vinsertf128 mz%2, mz%2, xmm12, 1
IF W, vinsertf128 mw%2, mw%2, xmm13, 1
%ifidn TYPE, SWS_PIXEL_U16
; undo the bias16 offset applied by ILOAD_SWS_PIXEL_U16
VBROADCASTI128 m15, [bias32]
IF X, paddd mx%2, m15
IF Y, paddd my%2, m15
IF Z, paddd mz%2, m15
IF W, paddd mw%2, m15
%endif
%ifnidn TYPE, SWS_PIXEL_F32
; integer accumulators -> float
IF X, vcvtdq2ps mx%2, mx%2
IF Y, vcvtdq2ps my%2, my%2
IF Z, vcvtdq2ps mz%2, mz%2
IF W, vcvtdq2ps mw%2, mw%2
%endif
%endmacro
; Horizontal filter kernel built from 4x4 transposed sub-filters.
;   %1: filter (output) pixel type; asserted to be SWS_PIXEL_F32
; Borrows out0q-out3q as per-pixel source offsets and bxq as a loop counter
; (all saved on the stack and restored), runs filter_h8_4x4 twice to produce
; the two halves of the block, and multiplies the result by scale_inv.
; Clobbers: tmp0q, tmp1q, tmp2q, m8-m15, flags.
%macro READ_PLANAR_FH_4X4 1
%xdefine FILTER_TYPE %1
%xdefine offset0q out0q
%xdefine offset1q out1q
%xdefine offset2q out2q
%xdefine offset3q out3q
%xdefine weights tmp0q
%xdefine offsets tmp2q
%xdefine fltsize tmp1q
%xdefine ILOAD_PX ILOAD_ %+ TYPE
assert_idn FILTER_TYPE, SWS_PIXEL_F32
%ifidn TYPE, SWS_PIXEL_F32
%assign SIZEOF_WEIGHT 4 ; F32
%else
%assign SIZEOF_WEIGHT 2 ; I16
%endif
; reserve some registers for the inner loops
push bxq
push offset0q
push offset1q
push offset2q
push offset3q
shl_log2 bxq, FILTER_WIDTH ; x := bx * FILTER_WIDTH
mov weights, [implq + SwsOpImpl.priv] ; int16_t *weights
mov tmp1d, [implq + SwsOpImpl.priv + 8] ; size_t filter_size
mov offsets, [execq + SwsOpExec.in_offset_x] ; int32_t *offsets
lea offsets, [offsets + 4 * bxq] ; offsets += x * sizeof(int32_t)
imul bxq, fltsize
; NOTE(review): unlike READ_PLANAR_FH there is no SIZEOF_WEIGHT factor here,
; and the inner loop counts fltsize in weight bytes -- presumably filter_size
; is stored in bytes for this kernel; confirm against the setup code
add weights, bxq ; weights += x * filter_size
shl fltsize, 2 ; fltsize *= 4 (number of pixels / iter)
; first 8 pixels -> mx..mw
filter_h8_4x4 offsets
; filter_h8_4x4 only advanced `weights` past its first 4 pixels
add weights, fltsize
; next 8 pixels (8 * sizeof(int32_t) = 32 bytes of offsets) -> mx2..mw2
filter_h8_4x4 offsets + 32, 2
VBROADCASTI128 m10, [scale_inv]
; restore in reverse push order
pop offset3q
pop offset2q
pop offset1q
pop offset0q
pop bxq
LOAD_CONT tmp0q
IF X, mulps mx, m10
IF Y, mulps my, m10
IF Z, mulps mz, m10
IF W, mulps mw, m10
IF X, mulps mx2, m10
IF Y, mulps my2, m10
IF Z, mulps mz2, m10
IF W, mulps mw2, m10
CONTINUE tmp0q
%undef offset0q
%undef offset1q
%undef offset2q
%undef offset3q
%undef weights
%undef offsets
%undef fltsize
%endmacro
;---------------------------------------------------------
; Arithmetic operations
; Multiply every active plane by a single float factor, which is stored
; inline at SwsOpImpl.priv.
; Clobbers: tmp0q, m8.
%macro SCALE 0
vbroadcastss m8, [implq + SwsOpImpl.priv]
LOAD_CONT tmp0q
IF X, mulps mx, m8
IF Y, mulps my, m8
IF Z, mulps mz, m8
IF W, mulps mw, m8
IF X, mulps mx2, m8
IF Y, mulps my2, m8
IF Z, mulps mz2, m8
IF W, mulps mw2, m8
CONTINUE tmp0q
%endmacro
; Add a per-plane float constant to every active plane. The four constants
; are stored consecutively (BYTES apart) at SwsOpImpl.priv.
; Clobbers: tmp0q, m8-m11 (only those of active planes).
%macro ADD 0
IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0 * BYTES]
IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 1 * BYTES]
IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 2 * BYTES]
IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 3 * BYTES]
LOAD_CONT tmp0q
IF X, addps mx, m8
IF Y, addps my, m9
IF Z, addps mz, m10
IF W, addps mw, m11
IF X, addps mx2, m8
IF Y, addps my2, m9
IF Z, addps mz2, m10
IF W, addps mw2, m11
CONTINUE tmp0q
%endmacro
; Clamp every active plane from above: plane = min(plane, constant). The four
; per-plane constants are stored consecutively (BYTES apart) at
; SwsOpImpl.priv.
; Clobbers: tmp0q, m8-m11 (only those of active planes).
%macro MIN 0
IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0 * BYTES]
IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 1 * BYTES]
IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 2 * BYTES]
IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 3 * BYTES]
LOAD_CONT tmp0q
IF X, minps mx, m8
IF Y, minps my, m9
IF Z, minps mz, m10
IF W, minps mw, m11
IF X, minps mx2, m8
IF Y, minps my2, m9
IF Z, minps mz2, m10
IF W, minps mw2, m11
CONTINUE tmp0q
%endmacro
; Clamp every active plane from below: plane = max(plane, constant). The four
; per-plane constants are stored consecutively (BYTES apart) at
; SwsOpImpl.priv.
; Clobbers: tmp0q, m8-m11 (only those of active planes).
%macro MAX 0
IF X, vbroadcastss m8, [implq + SwsOpImpl.priv + 0 * BYTES]
IF Y, vbroadcastss m9, [implq + SwsOpImpl.priv + 1 * BYTES]
IF Z, vbroadcastss m10, [implq + SwsOpImpl.priv + 2 * BYTES]
IF W, vbroadcastss m11, [implq + SwsOpImpl.priv + 3 * BYTES]
LOAD_CONT tmp0q
IF X, maxps mx, m8
IF Y, maxps my, m9
IF Z, maxps mz, m10
IF W, maxps mw, m11
IF X, maxps mx2, m8
IF Y, maxps my2, m9
IF Z, maxps mz2, m10
IF W, maxps mw2, m11
CONTINUE tmp0q
%endmacro
;---------------------------------------------------------
; Linear operations
; Bit of matrix entry (row I, column J) in the linear-op masks; every row has
; 5 entries (4 coefficients followed by the offset).
%define LIN_MASK(I, J) (1 << (5 * (I) + (J)))

; Fold one matrix term into the row accumulator.
;   %1 dst:      row accumulator
;   %2 src:      input component register
;   %3 use_coef: nonzero if src must be multiplied by the coefficient
;   %4 coef:     register holding the broadcast coefficient
;   %5 use_fma:  nonzero to fuse the multiply and the add
; Relies on the assembly-time flag INIT: while it is 0, dst holds no data yet
; and is written instead of accumulated into; INIT is then set to 1.
; Clobbers: coef (%4) on the non-fused multiply-add path.
%macro linear_muladd 5 ; dst, src, use_coef, coef, use_fma
%if !INIT
    ; first term of the row: write dst directly
    %if %3
        mulps   %1, %4, %2
    %else
        mova    %1, %2
    %endif
    %assign INIT 1
%elif !(%3)
    ; coefficient needs no multiply: plain accumulate
    addps   %1, %2
%elif (%5)
    fmaddps %1, %4, %2, %1
%else
    mulps   %4, %2
    addps   %1, %4
%endif
%endmacro
; Compute one output row of the linear (matrix) operation into dst.
;   %1 dst:    destination register
;   %2 src:    address expression of the row's 5 floats (4 coefficients
;              followed by the offset)
;   %3 row:    row index, selects the bits tested in the masks
;   %4 suffix: optional register-set suffix appended to mx/my/mz/mw
; Column J is skipped entirely when its ZERO_MASK bit is set, and used without
; a multiply when its ONE_MASK bit is set; otherwise its coefficient is
; broadcast into m12..m15. If the offset (column 4) is not masked as zero,
; dst starts out as the broadcast offset.
; Clobbers: m12-m15 (only those of loaded columns).
%macro linear_row 3-4 ; dst, src, row, suffix
%xdefine NEED(J) (!(ZERO_MASK & LIN_MASK(%3, J)))
%xdefine LOAD(J) (NEED(J) && !(ONE_MASK & LIN_MASK(%3, J)))
; NOTE(review): a set EXACT_MASK bit is what selects fmaddps in
; linear_muladd -- confirm this is the intended polarity
%xdefine FMA(J) (EXACT_MASK & LIN_MASK(%3, J))
%assign INIT 0 ; track whether `dst` already contains data
%if !(ZERO_MASK & LIN_MASK(%3, 4)) ; nonzero output offset
%assign INIT 1
vbroadcastss %1, [%2 + 4 * BYTES]
%endif
IF LOAD(0), vbroadcastss m12, [%2 + 0 * BYTES]
IF LOAD(1), vbroadcastss m13, [%2 + 1 * BYTES]
IF LOAD(2), vbroadcastss m14, [%2 + 2 * BYTES]
IF LOAD(3), vbroadcastss m15, [%2 + 3 * BYTES]
IF NEED(0), linear_muladd %1, mx%4, LOAD(0), m12, FMA(0)
IF NEED(1), linear_muladd %1, my%4, LOAD(1), m13, FMA(1)
IF NEED(2), linear_muladd %1, mz%4, LOAD(2), m14, FMA(2)
IF NEED(3), linear_muladd %1, mw%4, LOAD(3), m15, FMA(3)
; a row with neither offset nor any term would leave dst undefined
assert INIT, SWS_UOP_LINEAR should not contain empty rows
%endmacro
; Linear (matrix + offset) operation on the active planes.
;   %1 ONE_MASK, %2 ZERO_MASK, %3 EXACT_MASK: per-entry bit masks, indexed
;   with LIN_MASK(row, column), consumed by linear_row
; The matrix is stored indirectly: SwsOpImpl.priv holds its address, and each
; row occupies 5 * BYTES bytes.
; Clobbers: tmp0q, tmp1q, m8-m15.
%macro LINEAR_FMA 3
%assign ONE_MASK %1
%assign ZERO_MASK %2
%assign EXACT_MASK %3
mov tmp0q, [implq + SwsOpImpl.priv] ; address of matrix
LOAD_CONT tmp1q
; every row reads all of mx..mw, so the new values are built in m8..m11 and
; only copied back once all rows are done
IF1 X, linear_row m8, tmp0q + 0 * BYTES, 0
IF1 Y, linear_row m9, tmp0q + 5 * BYTES, 1
IF1 Z, linear_row m10, tmp0q + 10 * BYTES, 2
IF1 W, linear_row m11, tmp0q + 15 * BYTES, 3
IF X, mova mx, m8
IF Y, mova my, m9
IF Z, mova mz, m10
IF W, mova mw, m11
; same again for the second register set (suffix 2)
IF1 X, linear_row m8, tmp0q + 0 * BYTES, 0, 2
IF1 Y, linear_row m9, tmp0q + 5 * BYTES, 1, 2
IF1 Z, linear_row m10, tmp0q + 10 * BYTES, 2, 2
IF1 W, linear_row m11, tmp0q + 15 * BYTES, 3, 2
IF X, mova mx2, m8
IF Y, mova my2, m9
IF Z, mova mz2, m10
IF W, mova mw2, m11
CONTINUE tmp1q
%endmacro
;---------------------------------------------------------
; Dithering
; Fill a vector register from a memory pattern of `size` bytes.
;   %1 size: pattern size in bytes; 4, 8 and 16 are replicated across the
;            register, any other size is loaded as-is with mova
;   %2 dst:  destination vector register
;   %3 src:  memory operand
%macro VBROADCAST 3 ; size, dst, src
%if %1 == 16
    VBROADCASTF128  %2, %3
%elif %1 == 8
    vbroadcastsd    %2, %3
%elif %1 == 4
    vbroadcastss    %2, %3
%else
    ; pattern already fills the register
    mova            %2, %3
%endif
%endmacro
; Add a dither matrix to every active plane.
;   %1..%4: per-plane row offsets (x_off, y_off, z_off, w_off), in matrix rows
;   %5:     log2 of the matrix size
; The matrix is stored indirectly: SwsOpImpl.priv holds its address. The row
; is selected by y & (SIZE - 1); when one matrix row is wider than a block,
; the column is selected by the block's byte offset modulo the row stride.
; Rows no wider than one vector register are broadcast and added to both
; register sets; wider rows are added straight from memory.
; Clobbers: tmp0q, tmp1q, m8-m11 (broadcast path only), flags.
%macro DITHER 5 ; x_off, y_off, z_off, w_off, size_log2
%assign SIZE_LOG2 %5
%assign SIZE (1 << SIZE_LOG2)
%assign STRIDE (SIZE * BYTES)
    ; dither matrix is stored indirectly at the private data address
    mov tmp1q, [implq + SwsOpImpl.priv]
    ; add y offset. note that for 2x2, we would only need to look at the
    ; sign of `y`, but this special case is ignored for simplicity reasons
    ; (and because the current upstream format code never generates matrices
    ; that small)
    mov tmp0d, yd
    and tmp0d, SIZE - 1
    shl_log2 tmp0d, STRIDE
    add tmp1q, tmp0q
%if STRIDE > BLOCK_SIZE
    ; need to add in x offset: (bx * BLOCK_SIZE) mod STRIDE. The multiply
    ; must go through shl_log2 -- a plain `shl` would use BLOCK_SIZE itself
    ; as the shift count instead of its log2
    mov tmp0d, bxd
    shl_log2 tmp0d, BLOCK_SIZE
    and tmp0d, STRIDE - 1
    add tmp1q, tmp0q
%endif
%xdefine dither_addr_x (tmp1q + %1 * STRIDE)
%xdefine dither_addr_y (tmp1q + %2 * STRIDE)
%xdefine dither_addr_z (tmp1q + %3 * STRIDE)
%xdefine dither_addr_w (tmp1q + %4 * STRIDE)
%if STRIDE > mmsize
    ; a row spans both register sets: add directly from memory
    LOAD_CONT tmp0q
    IF X, addps mx, [dither_addr_x]
    IF Y, addps my, [dither_addr_y]
    IF Z, addps mz, [dither_addr_z]
    IF W, addps mw, [dither_addr_w]
    IF X, addps mx2, [dither_addr_x + mmsize]
    IF Y, addps my2, [dither_addr_y + mmsize]
    IF Z, addps mz2, [dither_addr_z + mmsize]
    IF W, addps mw2, [dither_addr_w + mmsize]
%else
    ; a row fits into one register: replicate it and reuse it for both sets
    IF X, VBROADCAST STRIDE, m8, [dither_addr_x]
    IF Y, VBROADCAST STRIDE, m9, [dither_addr_y]
    IF Z, VBROADCAST STRIDE, m10, [dither_addr_z]
    IF W, VBROADCAST STRIDE, m11, [dither_addr_w]
    LOAD_CONT tmp0q
    IF X, addps mx, m8
    IF Y, addps my, m9
    IF Z, addps mz, m10
    IF W, addps mw, m11
    IF X, addps mx2, m8
    IF Y, addps my2, m9
    IF Z, addps mz2, m10
    IF W, addps mw2, m11
%endif
    CONTINUE tmp0q
%endmacro
;---------------------------------------------------------
; Instantiate above macros to generate all uop kernels
; Instantiate all filter kernels for one input pixel type.
;   %1 (type): type tag pasted into the DECL_<type>_* macro names (U8, U16,
;              F32 below)
; The DECL_* and decl_suffix macros are not defined in this file (presumably
; generated from uops_macros.h); each one receives the name of the body macro
; to expand. The 4x4 horizontal variant is declared through
; `decl_suffix _4x4`, presumably to give it distinct symbol names -- confirm.
%macro decl_filter_ops 1 ; type
decl_suffix _4x4, DECL_%1_READ_PLANAR_FH (READ_PLANAR_FH_4X4)
DECL_%1_READ_PLANAR_FH (READ_PLANAR_FH)
DECL_%1_READ_PLANAR_FV (READ_PLANAR_FV)
DECL_%1_READ_PLANAR_FV_FMA (READ_PLANAR_FV_FMA)
%endmacro
; Instantiate the arithmetic, linear and dither kernels for one pixel type.
;   %1 (type): type tag pasted into the DECL_<type>_* macro names
; The DECL_* macros are not defined in this file; each one receives the name
; of the body macro to expand.
%macro decl_float_ops 1 ; type
DECL_%1_SCALE (SCALE)
DECL_%1_ADD (ADD)
DECL_%1_MIN (MIN)
DECL_%1_MAX (MAX)
DECL_%1_LINEAR_FMA (LINEAR_FMA)
DECL_%1_DITHER (DITHER)
%endmacro
; Generate the AVX2 (YMM) kernels: filter kernels for U8, U16 and F32 input,
; arithmetic kernels for F32 only.
INIT_YMM avx2
; presumably selects the two-register-per-plane layout (mx and mx2, ...) that
; all kernels above rely on -- defined/consumed outside this file, confirm
%assign V2 1
decl_filter_ops U8
decl_filter_ops U16
decl_filter_ops F32
decl_float_ops F32