diff --git a/libswscale/arm/hscale.S b/libswscale/arm/hscale.S index 5c3551a0f1..f85382c4a5 100644 --- a/libswscale/arm/hscale.S +++ b/libswscale/arm/hscale.S @@ -22,48 +22,48 @@ #include "libavutil/arm/asm.S" function ff_hscale_8_to_15_neon, export=1 - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ filter - ldr r5, [sp, #108] @ filterPos - ldr r6, [sp, #112] @ filterSize - add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2 -1: ldr r8, [r5], #4 @ filterPos[0] - ldr r9, [r5], #4 @ filterPos[1] - vmov.s32 q4, #0 @ val accumulator - vmov.s32 q5, #0 @ val accumulator - mov r7, r6 @ tmpfilterSize = filterSize - mov r0, r3 @ srcp -2: add r11, r0, r8 @ srcp + filterPos[0] - add r12, r0, r9 @ srcp + filterPos[1] - vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}] - vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}] - vld1.16 {q2}, [r4]! @ load 8x16-bit filter values - vld1.16 {q3}, [r10]! @ load 8x16-bit filter values - vmovl.u8 q0, d0 @ unpack src values to 16-bit - vmovl.u8 q1, d2 @ unpack src values to 16-bit - vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1) - vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2) - vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1) - vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2) - vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) - vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) - vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) - vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) - vadd.s32 q4, q8 @ update val accumulator - vadd.s32 q5, q10 @ update val accumulator - add r0, #8 @ srcp += 8 - subs r7, #8 @ tmpfilterSize -= 8 - bgt 2b @ loop until tmpfilterSize is consumed - mov r4, r10 @ filter = filter2 - add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2 - vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1) - vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2) - vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit - vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values - vst1.32 {d8[0]},[r1]! @ write destination - subs r2, #2 @ dstW -= 2 - bgt 1b @ loop until end of line - vpop {q4-q7} - pop {r4-r12, pc} + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ filter + ldr r5, [sp, #108] @ filterPos + ldr r6, [sp, #112] @ filterSize + add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2 +1: ldr r8, [r5], #4 @ filterPos[0] + ldr r9, [r5], #4 @ filterPos[1] + vmov.s32 q4, #0 @ val accumulator + vmov.s32 q5, #0 @ val accumulator + mov r7, r6 @ tmpfilterSize = filterSize + mov r0, r3 @ srcp +2: add r11, r0, r8 @ srcp + filterPos[0] + add r12, r0, r9 @ srcp + filterPos[1] + vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}] + vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}] + vld1.16 {q2}, [r4]! @ load 8x16-bit filter values + vld1.16 {q3}, [r10]! @ load 8x16-bit filter values + vmovl.u8 q0, d0 @ unpack src values to 16-bit + vmovl.u8 q1, d2 @ unpack src values to 16-bit + vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1) + vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2) + vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1) + vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2) + vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) + vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) + vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) + vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) + vadd.s32 q4, q8 @ update val accumulator + vadd.s32 q5, q10 @ update val accumulator + add r0, #8 @ srcp += 8 + subs r7, #8 @ tmpfilterSize -= 8 + bgt 2b @ loop until tmpfilterSize is consumed + mov r4, r10 @ filter = filter2 + add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2 + vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1) + vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2) + vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit + vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values + vst1.32 {d8[0]},[r1]! @ write destination + subs r2, #2 @ dstW -= 2 + bgt 1b @ loop until end of line + vpop {q4-q7} + pop {r4-r12, pc} endfunc diff --git a/libswscale/arm/output.S b/libswscale/arm/output.S index 670c6189e0..8a2922f885 100644 --- a/libswscale/arm/output.S +++ b/libswscale/arm/output.S @@ -22,56 +22,56 @@ #include "libavutil/arm/asm.S" function ff_yuv2planeX_8_neon, export=1 - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ dstW - ldr r5, [sp, #108] @ dither - ldr r6, [sp, #112] @ offset - vld1.8 {d0}, [r5] @ load 8x8-bit dither values - cmp r6, #0 @ check offsetting which can be 0 or 3 only - beq 1f - vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only -1: vmovl.u8 q0, d0 @ extend dither to 16-bit - vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1) - vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2) - mov r7, #0 @ i = 0 -2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1) - vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2) - mov r8, r1 @ tmpFilterSize = filterSize - mov r9, r2 @ srcp - mov r10, r0 @ filterp -3: ldr r11, [r9], #4 @ get pointer @ src[j] - ldr r12, [r9], #4 @ get pointer @ src[j+1] - add r11, r11, r7, lsl #1 @ &src[j][i] - add r12, r12, r7, lsl #1 @ &src[j+1][i] - vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H - vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P - ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1]) - vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction - vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction - vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P - vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y - vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y - vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y - vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y - vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y - vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y - vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y - vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y - vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y - vadd.s32 q3, q5 @ update val accumulator (part 1) - vadd.s32 q4, q6 @ update val accumulator (part 2) - subs r8, #2 @ tmpFilterSize -= 2 - bgt 3b @ loop until filterSize is consumed - vshr.s32 q3, q3, #19 @ val>>19 (part 1) - vshr.s32 q4, q4, #19 @ val>>19 (part 2) - vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1) - vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2) - vqmovn.u16 d6, q3 @ merge part 1 and part 2 - vst1.8 {d6}, [r3]! @ write destination - add r7, #8 @ i += 8 - subs r4, r4, #8 @ dstW -= 8 - bgt 2b @ loop until width is consumed - vpop {q4-q7} - pop {r4-r12, pc} + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ dstW + ldr r5, [sp, #108] @ dither + ldr r6, [sp, #112] @ offset + vld1.8 {d0}, [r5] @ load 8x8-bit dither values + cmp r6, #0 @ check offsetting which can be 0 or 3 only + beq 1f + vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only +1: vmovl.u8 q0, d0 @ extend dither to 16-bit + vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1) + vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2) + mov r7, #0 @ i = 0 +2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1) + vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2) + mov r8, r1 @ tmpFilterSize = filterSize + mov r9, r2 @ srcp + mov r10, r0 @ filterp +3: ldr r11, [r9], #4 @ get pointer @ src[j] + ldr r12, [r9], #4 @ get pointer @ src[j+1] + add r11, r11, r7, lsl #1 @ &src[j][i] + add r12, r12, r7, lsl #1 @ &src[j+1][i] + vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P + ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1]) + vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction + vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction + vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P + vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y + vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y + vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y + vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y + vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y + vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y + vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y + vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y + vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y + vadd.s32 q3, q5 @ update val accumulator (part 1) + vadd.s32 q4, q6 @ update val accumulator (part 2) + subs r8, #2 @ tmpFilterSize -= 2 + bgt 3b @ loop until filterSize is consumed + vshr.s32 q3, q3, #19 @ val>>19 (part 1) + vshr.s32 q4, q4, #19 @ val>>19 (part 2) + vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1) + vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2) + vqmovn.u16 d6, q3 @ merge part 1 and part 2 + vst1.8 {d6}, [r3]! @ write destination + add r7, #8 @ i += 8 + subs r4, r4, #8 @ dstW -= 8 + bgt 2b @ loop until width is consumed + vpop {q4-q7} + pop {r4-r12, pc} endfunc diff --git a/libswscale/arm/rgb2yuv_neon_16.S b/libswscale/arm/rgb2yuv_neon_16.S index ad7e679ca9..5d7008be80 100644 --- a/libswscale/arm/rgb2yuv_neon_16.S +++ b/libswscale/arm/rgb2yuv_neon_16.S @@ -36,34 +36,34 @@ alias y16x16_h, q14 alias_qw y8x16, q15 .macro init src - vld3.i32 {q13_l, q14_l, q15_l}, [\src]! - vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] - vrshrn.i32 CO_R, q13, #7 - vrshrn.i32 CO_G, q14, #7 - vrshrn.i32 CO_B, q15, #7 + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! + vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + vrshrn.i32 CO_R, q13, #7 + vrshrn.i32 CO_G, q14, #7 + vrshrn.i32 CO_B, q15, #7 - vmov.u8 BIAS_Y, #16 - vmov.u8 BIAS_U, #128 + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 .endm .macro compute_y_16x1_step action, s8x16, coeff - vmovl.u8 n16x16_l, \s8x16\()_l - vmovl.u8 n16x16_h, \s8x16\()_h + vmovl.u8 n16x16_l, \s8x16\()_l + vmovl.u8 n16x16_h, \s8x16\()_h - \action y16x16_l, n16x16_l, \coeff - \action y16x16_h, n16x16_h, \coeff + \action y16x16_l, n16x16_l, \coeff + \action y16x16_h, n16x16_h, \coeff .endm .macro compute_y_16x1 - compute_y_16x1_step vmul, r8x16, CO_RY - compute_y_16x1_step vmla, g8x16, CO_GY - compute_y_16x1_step vmla, b8x16, CO_BY + compute_y_16x1_step vmul, r8x16, CO_RY + compute_y_16x1_step vmla, g8x16, CO_GY + compute_y_16x1_step vmla, b8x16, CO_BY - vrshrn.i16 y8x16_l, y16x16_l, #8 - vrshrn.i16 y8x16_h, y16x16_h, #8 + vrshrn.i16 y8x16_l, y16x16_l, #8 + vrshrn.i16 y8x16_h, y16x16_h, #8 - vadd.u8 y8x16, y8x16, BIAS_Y + vadd.u8 y8x16, y8x16, BIAS_Y .endm alias c16x8, q15 @@ -71,13 +71,13 @@ alias_qw c8x8x2, q10 .macro compute_chroma_8x1 c, C - vmul c16x8, r16x8, CO_R\C - vmla c16x8, g16x8, CO_G\C - vmla c16x8, b16x8, CO_B\C + vmul c16x8, r16x8, CO_R\C + vmla c16x8, g16x8, CO_G\C + vmla c16x8, b16x8, CO_B\C - vrshrn.i16 \c\()8x8, c16x8, #8 - vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C + vrshrn.i16 \c\()8x8, c16x8, #8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C .endm - loop_420sp rgbx, nv12, init, kernel_420_16x2, 16 + loop_420sp rgbx, nv12, init, kernel_420_16x2, 16 #endif diff --git a/libswscale/arm/rgb2yuv_neon_32.S b/libswscale/arm/rgb2yuv_neon_32.S index 4fd0f64a09..b52093845a 100644 --- a/libswscale/arm/rgb2yuv_neon_32.S +++ b/libswscale/arm/rgb2yuv_neon_32.S @@ -48,27 +48,27 @@ alias y8x16, y16x16_e .macro init src - // load s32x3x3, narrow to s16x3x3 - vld3.i32 {q13_l, q14_l, q15_l}, [\src]! - vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + // load s32x3x3, narrow to s16x3x3 + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! + vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] - vmovn.i32 CO_R, q13 - vmovn.i32 CO_G, q14 - vmovn.i32 CO_B, q15 + vmovn.i32 CO_R, q13 + vmovn.i32 CO_G, q14 + vmovn.i32 CO_B, q15 - vmov.u8 BIAS_Y, #16 - vmov.u8 BIAS_U, #128 + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 .endm .macro compute_y_16x1_step action, s8x16, coeff - vmov.u8 n16x16_o, #0 - vtrn.u8 \s8x16, n16x16_o + vmov.u8 n16x16_o, #0 + vtrn.u8 \s8x16, n16x16_o - \action y32x16_el, \s8x16\()_l, \coeff - \action y32x16_eh, \s8x16\()_h, \coeff - \action y32x16_ol, n16x16_ol, \coeff - \action y32x16_oh, n16x16_oh, \coeff + \action y32x16_el, \s8x16\()_l, \coeff + \action y32x16_eh, \s8x16\()_h, \coeff + \action y32x16_ol, n16x16_ol, \coeff + \action y32x16_oh, n16x16_oh, \coeff .endm /* @@ -77,17 +77,17 @@ alias y8x16, y16x16_e * clobber: q11-q15, r8x16, g8x16, b8x16 */ .macro compute_y_16x1 - compute_y_16x1_step vmull, r8x16, CO_RY - compute_y_16x1_step vmlal, g8x16, CO_GY - compute_y_16x1_step vmlal, b8x16, CO_BY + compute_y_16x1_step vmull, r8x16, CO_RY + compute_y_16x1_step vmlal, g8x16, CO_GY + compute_y_16x1_step vmlal, b8x16, CO_BY - vrshrn.i32 y16x16_el, y32x16_el, #15 - vrshrn.i32 y16x16_eh, y32x16_eh, #15 - vrshrn.i32 y16x16_ol, y32x16_ol, #15 - vrshrn.i32 y16x16_oh, y32x16_oh, #15 + vrshrn.i32 y16x16_el, y32x16_el, #15 + vrshrn.i32 y16x16_eh, y32x16_eh, #15 + vrshrn.i32 y16x16_ol, y32x16_ol, #15 + vrshrn.i32 y16x16_oh, y32x16_oh, #15 - vtrn.8 y16x16_e, y16x16_o - vadd.u8 y8x16, y8x16, BIAS_Y + vtrn.8 y16x16_e, y16x16_o + vadd.u8 y8x16, y8x16, BIAS_Y .endm alias c32x8_l, q14 @@ -97,8 +97,8 @@ alias_qw c16x8, q13 alias_qw c8x8x2, q10 .macro compute_chroma_8x1_step action, s16x8, coeff - \action c32x8_l, \s16x8\()_l, \coeff - \action c32x8_h, \s16x8\()_h, \coeff + \action c32x8_l, \s16x8\()_l, \coeff + \action c32x8_h, \s16x8\()_h, \coeff .endm /* @@ -107,16 +107,16 @@ alias_qw c8x8x2, q10 * clobber: q14-q15 */ .macro compute_chroma_8x1 c, C - compute_chroma_8x1_step vmull, r16x8, CO_R\C - compute_chroma_8x1_step vmlal, g16x8, CO_G\C - compute_chroma_8x1_step vmlal, b16x8, CO_B\C + compute_chroma_8x1_step vmull, r16x8, CO_R\C + compute_chroma_8x1_step vmlal, g16x8, CO_G\C + compute_chroma_8x1_step vmlal, b16x8, CO_B\C - vrshrn.i32 c16x8_l, c32x8_l, #15 - vrshrn.i32 c16x8_h, c32x8_h, #15 - vmovn.i16 \c\()8x8, c16x8 - vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C + vrshrn.i32 c16x8_l, c32x8_l, #15 + vrshrn.i32 c16x8_h, c32x8_h, #15 + vmovn.i16 \c\()8x8, c16x8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C .endm - loop_420sp rgbx, nv12, init, kernel_420_16x2, 32 +loop_420sp rgbx, nv12, init, kernel_420_16x2, 32 #endif diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S index c0a7bb6716..f72d34764d 100644 --- a/libswscale/arm/rgb2yuv_neon_common.S +++ b/libswscale/arm/rgb2yuv_neon_common.S @@ -31,10 +31,10 @@ .altmacro .macro alias_dw_all qw, dw_l, dw_h - alias q\qw\()_l, d\dw_l - alias q\qw\()_h, d\dw_h + alias q\qw\()_l, d\dw_l + alias q\qw\()_h, d\dw_h .if \qw < 15 - alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2) + alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2) .endif .endm @@ -43,23 +43,23 @@ alias_dw_all 0, 0, 1 .noaltmacro .macro alias_qw name, qw, set=1 - alias \name\(), \qw, \set - alias \name\()_l, \qw\()_l, \set - alias \name\()_h, \qw\()_h, \set + alias \name\(), \qw, \set + alias \name\()_l, \qw\()_l, \set + alias \name\()_h, \qw\()_h, \set .endm .macro prologue - push {r4-r12, lr} - vpush {q4-q7} + push {r4-r12, lr} + vpush {q4-q7} .endm .macro epilogue - vpop {q4-q7} - pop {r4-r12, pc} + vpop {q4-q7} + pop {r4-r12, pc} .endm .macro load_arg reg, ix - ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)] + ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)] .endm @@ -69,167 +69,167 @@ alias_dw_all 0, 0, 1 * int32_t coeff_table[9]); */ .macro alias_loop_420sp set=1 - alias src, r0, \set - alias src0, src, \set - alias y, r1, \set - alias y0, y, \set - alias chroma, r2, \set - alias width, r3, \set - alias header, width, \set + alias src, r0, \set + alias src0, src, \set + alias y, r1, \set + alias y0, y, \set + alias chroma, r2, \set + alias width, r3, \set + alias header, width, \set - alias height, r4, \set - alias y_stride, r5, \set - alias c_stride, r6, \set - alias c_padding, c_stride, \set - alias src_stride, r7, \set + alias height, r4, \set + alias y_stride, r5, \set + alias c_stride, r6, \set + alias c_padding, c_stride, \set + alias src_stride, r7, \set - alias y0_end, r8, \set + alias y0_end, r8, \set - alias src_padding,r9, \set - alias y_padding, r10, \set + alias src_padding,r9, \set + alias y_padding, r10, \set - alias src1, r11, \set - alias y1, r12, \set + alias src1, r11, \set + alias y1, r12, \set - alias coeff_table,r12, \set + alias coeff_table,r12, \set .endm .macro loop_420sp s_fmt, d_fmt, init, kernel, precision function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1 - prologue + prologue - alias_loop_420sp + alias_loop_420sp - load_arg height, 4 - load_arg y_stride, 5 - load_arg c_stride, 6 - load_arg src_stride, 7 - load_arg coeff_table, 8 + load_arg height, 4 + load_arg y_stride, 5 + load_arg c_stride, 6 + load_arg src_stride, 7 + load_arg coeff_table, 8 - \init coeff_table + \init coeff_table - sub y_padding, y_stride, width - sub c_padding, c_stride, width - sub src_padding, src_stride, width, lsl #2 + sub y_padding, y_stride, width + sub c_padding, c_stride, width + sub src_padding, src_stride, width, lsl #2 - add y0_end, y0, width - and header, width, #15 + add y0_end, y0, width + and header, width, #15 - add y1, y0, y_stride - add src1, src0, src_stride + add y1, y0, y_stride + add src1, src0, src_stride 0: - cmp header, #0 - beq 1f + cmp header, #0 + beq 1f - \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header 1: - \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma - cmp y0, y0_end - blt 1b + cmp y0, y0_end + blt 1b 2: - add y0, y1, y_padding - add y0_end, y1, y_stride - add chroma, chroma, c_padding - add src0, src1, src_padding + add y0, y1, y_padding + add y0_end, y1, y_stride + add chroma, chroma, c_padding + add src0, src1, src_padding - add y1, y0, y_stride - add src1, src0, src_stride + add y1, y0, y_stride + add src1, src0, src_stride - subs height, height, #2 + subs height, height, #2 - bgt 0b + bgt 0b - epilogue + epilogue - alias_loop_420sp 0 + alias_loop_420sp 0 endfunc .endm .macro downsample - vpaddl.u8 r16x8, r8x16 - vpaddl.u8 g16x8, g8x16 - vpaddl.u8 b16x8, b8x16 + vpaddl.u8 r16x8, r8x16 + vpaddl.u8 g16x8, g8x16 + vpaddl.u8 b16x8, b8x16 .endm /* accumulate and right shift by 2 */ .macro downsample_ars2 - vpadal.u8 r16x8, r8x16 - vpadal.u8 g16x8, g8x16 - vpadal.u8 b16x8, b8x16 + vpadal.u8 r16x8, r8x16 + vpadal.u8 g16x8, g8x16 + vpadal.u8 b16x8, b8x16 - vrshr.u16 r16x8, r16x8, #2 - vrshr.u16 g16x8, g16x8, #2 - vrshr.u16 b16x8, b16x8, #2 + vrshr.u16 r16x8, r16x8, #2 + vrshr.u16 g16x8, g16x8, #2 + vrshr.u16 b16x8, b16x8, #2 .endm .macro store_y8_16x1 dst, count .ifc "\count","" - vstmia \dst!, {y8x16} + vstmia \dst!, {y8x16} .else - vstmia \dst, {y8x16} - add \dst, \dst, \count + vstmia \dst, {y8x16} + add \dst, \dst, \count .endif .endm .macro store_chroma_nv12_8x1 dst, count .ifc "\count","" - vst2.i8 {u8x8, v8x8}, [\dst]! + vst2.i8 {u8x8, v8x8}, [\dst]! .else - vst2.i8 {u8x8, v8x8}, [\dst], \count + vst2.i8 {u8x8, v8x8}, [\dst], \count .endif .endm .macro store_chroma_nv21_8x1 dst, count .ifc "\count","" - vst2.i8 {v8x8, u8x8}, [\dst]! + vst2.i8 {v8x8, u8x8}, [\dst]! .else - vst2.i8 {v8x8, u8x8}, [\dst], \count + vst2.i8 {v8x8, u8x8}, [\dst], \count .endif .endm .macro load_8888_16x1 a, b, c, d, src, count .ifc "\count","" - vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! - vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]! + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]! .else - vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! - vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src] - sub \src, \src, #32 - add \src, \src, \count, lsl #2 + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src] + sub \src, \src, #32 + add \src, \src, \count, lsl #2 .endif .endm .macro load_rgbx_16x1 src, count - load_8888_16x1 r, g, b, x, \src, \count + load_8888_16x1 r, g, b, x, \src, \count .endm .macro load_bgrx_16x1 src, count - load_8888_16x1 b, g, r, x, \src, \count + load_8888_16x1 b, g, r, x, \src, \count .endm .macro alias_src_rgbx set=1 - alias_src_8888 r, g, b, x, \set + alias_src_8888 r, g, b, x, \set .endm .macro alias_src_bgrx set=1 - alias_src_8888 b, g, r, x, \set + alias_src_8888 b, g, r, x, \set .endm .macro alias_dst_nv12 set=1 - alias u8x8, c8x8x2_l, \set - alias v8x8, c8x8x2_h, \set + alias u8x8, c8x8x2_l, \set + alias v8x8, c8x8x2_h, \set .endm .macro alias_dst_nv21 set=1 - alias v8x8, c8x8x2_l, \set - alias u8x8, c8x8x2_h, \set + alias v8x8, c8x8x2_l, \set + alias u8x8, c8x8x2_h, \set .endm @@ -259,33 +259,33 @@ alias BIAS_Y, q2 /* q3-q6 R8G8B8X8 x16 */ .macro alias_src_8888 a, b, c, d, set - alias_qw \a\()8x16, q3, \set - alias_qw \b\()8x16, q4, \set - alias_qw \c\()8x16, q5, \set - alias_qw \d\()8x16, q6, \set + alias_qw \a\()8x16, q3, \set + alias_qw \b\()8x16, q4, \set + alias_qw \c\()8x16, q5, \set + alias_qw \d\()8x16, q6, \set .endm .macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count - alias_src_\rgb_fmt - alias_dst_\yuv_fmt + alias_src_\rgb_fmt + alias_dst_\yuv_fmt - load_\rgb_fmt\()_16x1 \rgb0, \count + load_\rgb_fmt\()_16x1 \rgb0, \count - downsample - compute_y_16x1 - store_y8_16x1 \y0, \count + downsample + compute_y_16x1 + store_y8_16x1 \y0, \count - load_\rgb_fmt\()_16x1 \rgb1, \count - downsample_ars2 - compute_y_16x1 - store_y8_16x1 \y1, \count + load_\rgb_fmt\()_16x1 \rgb1, \count + downsample_ars2 + compute_y_16x1 + store_y8_16x1 \y1, \count - compute_chroma_8x1 u, U - compute_chroma_8x1 v, V + compute_chroma_8x1 u, U + compute_chroma_8x1 v, V - store_chroma_\yuv_fmt\()_8x1 \chroma, \count + store_chroma_\yuv_fmt\()_8x1 \chroma, \count - alias_dst_\yuv_fmt 0 - alias_src_\rgb_fmt 0 + alias_dst_\yuv_fmt 0 + alias_src_\rgb_fmt 0 .endm diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S index 6777d625f9..bd4c8fcc24 100644 --- a/libswscale/arm/yuv2rgb_neon.S +++ b/libswscale/arm/yuv2rgb_neon.S @@ -23,254 +23,254 @@ .macro compute_premult - vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3) - vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3) - vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r - vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g - vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g - vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g - vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b + vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3) + vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3) + vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r + vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g + vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g + vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g + vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b .endm .macro compute_color dst_comp1 dst_comp2 pre - vadd.s16 q1, q14, \pre - vadd.s16 q2, q15, \pre - vqrshrun.s16 \dst_comp1, q1, #1 - vqrshrun.s16 \dst_comp2, q2, #1 + vadd.s16 q1, q14, \pre + vadd.s16 q2, q15, \pre + vqrshrun.s16 \dst_comp1, q1, #1 + vqrshrun.s16 \dst_comp2, q2, #1 .endm .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 - compute_color \r1, \r2, q8 - compute_color \g1, \g2, q9 - compute_color \b1, \b2, q10 - vmov.u8 \a1, #255 - vmov.u8 \a2, #255 + compute_color \r1, \r2, q8 + compute_color \g1, \g2, q9 + compute_color \b1, \b2, q10 + vmov.u8 \a1, #255 + vmov.u8 \a2, #255 .endm .macro compute dst ofmt - vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3) - vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3) - vsub.s16 q14, q12 @ q14 = (Y - y_offset) - vsub.s16 q15, q12 @ q15 = (Y - y_offset) - vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff - vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff + vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3) + vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3) + vsub.s16 q14, q12 @ q14 = (Y - y_offset) + vsub.s16 q15, q12 @ q15 = (Y - y_offset) + vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff + vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff .ifc \ofmt,argb - compute_rgba d7, d8, d9, d6, d11, d12, d13, d10 + compute_rgba d7, d8, d9, d6, d11, d12, d13, d10 .endif .ifc \ofmt,rgba - compute_rgba d6, d7, d8, d9, d10, d11, d12, d13 + compute_rgba d6, d7, d8, d9, d10, d11, d12, d13 .endif .ifc \ofmt,abgr - compute_rgba d9, d8, d7, d6, d13, d12, d11, d10 + compute_rgba d9, d8, d7, d6, d13, d12, d11, d10 .endif .ifc \ofmt,bgra - compute_rgba d8, d7, d6, d9, d12, d11, d10, d13 + compute_rgba d8, d7, d6, d9, d12, d11, d10, d13 .endif - vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16 - vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16 - vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16 - vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16 - vst4.8 {q3, q4}, [\dst]! - vst4.8 {q5, q6}, [\dst]! + vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16 + vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16 + vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16 + vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16 + vst4.8 {q3, q4}, [\dst]! + vst4.8 {q5, q6}, [\dst]! .endm .macro process_1l_internal dst src ofmt - vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved) - compute \dst, \ofmt + vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved) + compute \dst, \ofmt .endm .macro process_1l ofmt - compute_premult - process_1l_internal r2, r4, \ofmt + compute_premult + process_1l_internal r2, r4, \ofmt .endm .macro process_2l ofmt - compute_premult - process_1l_internal r2, r4, \ofmt - process_1l_internal r11,r12,\ofmt + compute_premult + process_1l_internal r2, r4, \ofmt + process_1l_internal r11,r12,\ofmt .endm .macro load_args_nv12 - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ r4 = srcY - ldr r5, [sp, #108] @ r5 = linesizeY - ldr r6, [sp, #112] @ r6 = srcC - ldr r7, [sp, #116] @ r7 = linesizeC - ldr r8, [sp, #120] @ r8 = table - ldr r9, [sp, #124] @ r9 = y_offset - ldr r10,[sp, #128] @ r10 = y_coeff - vdup.16 d0, r10 @ d0 = y_coeff - vld1.16 {d1}, [r8] @ d1 = *table - add r11, r2, r3 @ r11 = dst + linesize (dst2) - add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) - lsl r3, r3, #1 - lsl r5, r5, #1 - sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) - sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) - sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcC + ldr r7, [sp, #116] @ r7 = linesizeC + ldr r8, [sp, #120] @ r8 = table + ldr r9, [sp, #124] @ r9 = y_offset + ldr r10,[sp, #128] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + add r11, r2, r3 @ r11 = dst + linesize (dst2) + add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) + lsl r3, r3, #1 + lsl r5, r5, #1 + sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) + sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) .endm .macro load_args_nv21 - load_args_nv12 + load_args_nv12 .endm .macro load_args_yuv420p - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ r4 = srcY - ldr r5, [sp, #108] @ r5 = linesizeY - ldr r6, [sp, #112] @ r6 = srcU - ldr r8, [sp, #128] @ r8 = table - ldr r9, [sp, #132] @ r9 = y_offset - ldr r10,[sp, #136] @ r10 = y_coeff - vdup.16 d0, r10 @ d0 = y_coeff - vld1.16 {d1}, [r8] @ d1 = *table - add r11, r2, r3 @ r11 = dst + linesize (dst2) - add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) - lsl r3, r3, #1 - lsl r5, r5, #1 - sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) - sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) - ldr r10,[sp, #120] @ r10 = srcV + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcU + ldr r8, [sp, #128] @ r8 = table + ldr r9, [sp, #132] @ r9 = y_offset + ldr r10,[sp, #136] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + add r11, r2, r3 @ r11 = dst + linesize (dst2) + add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) + lsl r3, r3, #1 + lsl r5, r5, #1 + sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) + ldr r10,[sp, #120] @ r10 = srcV .endm .macro load_args_yuv422p - push {r4-r12, lr} - vpush {q4-q7} - ldr r4, [sp, #104] @ r4 = srcY - ldr r5, [sp, #108] @ r5 = linesizeY - ldr r6, [sp, #112] @ r6 = srcU - ldr r7, [sp, #116] @ r7 = linesizeU - ldr r12,[sp, #124] @ r12 = linesizeV - ldr r8, [sp, #128] @ r8 = table - ldr r9, [sp, #132] @ r9 = y_offset - ldr r10,[sp, #136] @ r10 = y_coeff - vdup.16 d0, r10 @ d0 = y_coeff - vld1.16 {d1}, [r8] @ d1 = *table - sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding) - sub r5, r5, r0 @ r5 = linesizeY - width (paddingY) - sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) - sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV) - ldr r10,[sp, #120] @ r10 = srcV + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcU + ldr r7, [sp, #116] @ r7 = linesizeU + ldr r12,[sp, #124] @ r12 = linesizeV + ldr r8, [sp, #128] @ r8 = table + ldr r9, [sp, #132] @ r9 = y_offset + ldr r10,[sp, #136] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY - width (paddingY) + sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) + sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV) + ldr r10,[sp, #120] @ r10 = srcV .endm .macro load_chroma_nv12 - pld [r12, #64*3] + pld [r12, #64*3] - vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line - vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) - vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) .endm .macro load_chroma_nv21 - pld [r12, #64*3] + pld [r12, #64*3] - vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line - vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3) - vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3) + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line + vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3) .endm .macro load_chroma_yuv420p - pld [r10, #64*3] - pld [r12, #64*3] + pld [r10, #64*3] + pld [r12, #64*3] - vld1.8 d2, [r6]! @ d2: chroma red line - vld1.8 d3, [r10]! @ d3: chroma blue line - vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) - vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) + vld1.8 d2, [r6]! @ d2: chroma red line + vld1.8 d3, [r10]! @ d3: chroma blue line + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) .endm .macro load_chroma_yuv422p - pld [r10, #64*3] + pld [r10, #64*3] - vld1.8 d2, [r6]! @ d2: chroma red line - vld1.8 d3, [r10]! @ d3: chroma blue line - vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) - vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) + vld1.8 d2, [r6]! @ d2: chroma red line + vld1.8 d3, [r10]! @ d3: chroma blue line + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) .endm .macro increment_and_test_nv12 - add r11, r11, r3 @ dst2 += padding - add r12, r12, r5 @ srcY2 += paddingY - add r6, r6, r7 @ srcC += paddingC - subs r1, r1, #2 @ height -= 2 + add r11, r11, r3 @ dst2 += padding + add r12, r12, r5 @ srcY2 += paddingY + add r6, r6, r7 @ srcC += paddingC + subs r1, r1, #2 @ height -= 2 .endm .macro increment_and_test_nv21 - increment_and_test_nv12 + increment_and_test_nv12 .endm .macro increment_and_test_yuv420p - add r11, r11, r3 @ dst2 += padding - add r12, r12, r5 @ srcY2 += paddingY - ldr r7, [sp, #116] @ r7 = linesizeU - sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) - add r6, r6, r7 @ srcU += paddingU - ldr r7, [sp, #124] @ r7 = linesizeV - sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) - add r10, r10, r7 @ srcV += paddingV - subs r1, r1, #2 @ height -= 2 + add r11, r11, r3 @ dst2 += padding + add r12, r12, r5 @ srcY2 += paddingY + ldr r7, [sp, #116] @ r7 = linesizeU + sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) + add r6, r6, r7 @ srcU += paddingU + ldr r7, [sp, #124] @ r7 = linesizeV + sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) + add r10, r10, r7 @ srcV += paddingV + subs r1, r1, #2 @ height -= 2 .endm .macro increment_and_test_yuv422p - add r6, r6, r7 @ srcU += paddingU - add r10,r10,r12 @ srcV += paddingV - subs r1, r1, #1 @ height -= 1 + add r6, r6, r7 @ srcU += paddingU + add r10,r10,r12 @ srcV += paddingV + subs r1, r1, #1 @ height -= 1 .endm .macro process_nv12 ofmt - process_2l \ofmt + process_2l \ofmt .endm .macro process_nv21 ofmt - process_2l \ofmt + process_2l \ofmt .endm .macro process_yuv420p ofmt - process_2l \ofmt + process_2l \ofmt .endm .macro process_yuv422p ofmt - process_1l \ofmt + process_1l \ofmt .endm .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 - load_args_\ifmt - vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3) - vdup.16 q12, r9 @ q12 = y_offset - vmov d26, d0 @ q13 = y_coeff - vmov d27, d0 @ q13 = y_coeff + load_args_\ifmt + vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3) + vdup.16 q12, r9 @ q12 = y_offset + vmov d26, d0 @ q13 = y_coeff + vmov d27, d0 @ q13 = y_coeff 1: - mov r8, r0 @ r8 = width + mov r8, r0 @ r8 = width 2: - pld [r6, #64*3] - pld [r4, #64*3] - vmov.i8 d10, #128 - load_chroma_\ifmt - process_\ifmt \ofmt - subs r8, r8, #16 @ width -= 16 - bgt 2b - add r2, r2, r3 @ dst += padding - add r4, r4, r5 @ srcY += paddingY - increment_and_test_\ifmt - bgt 1b - vpop {q4-q7} - pop {r4-r12, pc} + pld [r6, #64*3] + pld [r4, #64*3] + vmov.i8 d10, #128 + load_chroma_\ifmt + process_\ifmt \ofmt + subs r8, r8, #16 @ width -= 16 + bgt 2b + add r2, r2, r3 @ dst += padding + add r4, r4, r5 @ srcY += paddingY + increment_and_test_\ifmt + bgt 1b + vpop {q4-q7} + pop {r4-r12, pc} endfunc .endm .macro declare_rgb_funcs ifmt - declare_func \ifmt, argb - declare_func \ifmt, rgba - declare_func \ifmt, abgr - declare_func \ifmt, bgra + declare_func \ifmt, argb + declare_func \ifmt, rgba + declare_func \ifmt, abgr + declare_func \ifmt, bgra .endm declare_rgb_funcs nv12