; Source listing: www.pudn.com > X264_20060729.rar > pixel-a.asm
;***************************************************************************** ;* pixel.asm: h264 encoder library ;***************************************************************************** ;* Copyright (C) 2003 x264 project ;* $Id: pixel.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $ ;* ;* Authors: Laurent Aimar;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* ;* This program is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License ;* along with this program; if not, write to the Free Software ;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. 
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

; parmNq / parmNd (Nth integer argument, 64-/32-bit view), cglobal,
; FENC_STRIDE and FDEC_STRIDE are not defined in this file; presumably they
; come from amd64inc.asm.
%include "amd64inc.asm"

; NOTE(review): the C prototypes below declare strides as `int`, but the code
; uses the full 64-bit argument registers (parm2q, parm4q, parm5q, parm6q)
; without sign-extending them. This presumably relies on callers passing
; already-extended values -- confirm against the callers / ABI.

;-----------------------------------------------------------------------------
; sad
;
; SAD_INC_2x<W>P: accumulate the SAD of two rows of width W into mm0.
;   in:      parm1q = pix1, parm2q = stride1, parm3q = pix2, parm4q = stride2
;   out:     mm0 += SAD; parm1q and parm3q advanced by two rows
;   clobber: mm1..mm4 (16P, 4P) or mm1..mm2 (8P)
;-----------------------------------------------------------------------------

%macro SAD_INC_2x16P 0
    movq    mm1,    [parm1q]
    movq    mm2,    [parm1q+8]
    movq    mm3,    [parm1q+parm2q]
    movq    mm4,    [parm1q+parm2q+8]
    psadbw  mm1,    [parm3q]
    psadbw  mm2,    [parm3q+8]
    psadbw  mm3,    [parm3q+parm4q]
    psadbw  mm4,    [parm3q+parm4q+8]
    lea     parm1q, [parm1q+2*parm2q]
    paddw   mm1,    mm2
    paddw   mm3,    mm4
    lea     parm3q, [parm3q+2*parm4q]
    paddw   mm0,    mm1
    paddw   mm0,    mm3
%endmacro

%macro SAD_INC_2x8P 0
    movq    mm1,    [parm1q]
    movq    mm2,    [parm1q+parm2q]
    psadbw  mm1,    [parm3q]
    psadbw  mm2,    [parm3q+parm4q]
    lea     parm1q, [parm1q+2*parm2q]
    paddw   mm0,    mm1
    paddw   mm0,    mm2
    lea     parm3q, [parm3q+2*parm4q]
%endmacro

%macro SAD_INC_2x4P 0
    movd    mm1,    [parm1q]
    movd    mm2,    [parm3q]
    movd    mm3,    [parm1q+parm2q]
    movd    mm4,    [parm3q+parm4q]
    psadbw  mm1,    mm2
    psadbw  mm3,    mm4
    paddw   mm0,    mm1
    paddw   mm0,    mm3
    lea     parm1q, [parm1q+2*parm2q]
    lea     parm3q, [parm3q+2*parm4q]
%endmacro

;-----------------------------------------------------------------------------
; sad x3 / x4
;
; One source block (parm1q, row pitch FENC_STRIDE) is compared against three
; (parm2q..parm4q, stride parm5q) or four (parm2q..parm5q, stride parm6q)
; candidate blocks. Running sums live in mm0..mm2 (x3) or mm0..mm3 (x4).
; The *_START_* variants initialise the sums instead of adding to them.
; The first macro parameter of the X3 row macros is the mov suffix
; (q = 8 bytes per row, d = 4 bytes per row).
;-----------------------------------------------------------------------------

%macro SAD_X3_START_1x8P 1
    mov%1   mm3,    [parm1q]
    mov%1   mm0,    [parm2q]
    mov%1   mm1,    [parm3q]
    mov%1   mm2,    [parm4q]
    psadbw  mm0,    mm3
    psadbw  mm1,    mm3
    psadbw  mm2,    mm3
%endmacro

; %1 = mov suffix, %2 = offset into fenc, %3 = offset into the candidates
%macro SAD_X3_1x8P 3
    mov%1   mm3,    [parm1q+%2]
    mov%1   mm4,    [parm2q+%3]
    mov%1   mm5,    [parm3q+%3]
    mov%1   mm6,    [parm4q+%3]
    psadbw  mm4,    mm3
    psadbw  mm5,    mm3
    psadbw  mm6,    mm3
    paddw   mm0,    mm4
    paddw   mm1,    mm5
    paddw   mm2,    mm6
%endmacro

; %1 = nonzero for the first row pair (initialises the sums)
%macro SAD_X3_2x16P 1
%if %1
    SAD_X3_START_1x8P q
%else
    SAD_X3_1x8P q, 0, 0
%endif
    SAD_X3_1x8P q, 8, 8
    SAD_X3_1x8P q, FENC_STRIDE, parm5q
    SAD_X3_1x8P q, FENC_STRIDE+8, parm5q+8
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]
%endmacro

%macro SAD_X3_2x8P 1
%if %1
    SAD_X3_START_1x8P q
%else
    SAD_X3_1x8P q, 0, 0
%endif
    SAD_X3_1x8P q, FENC_STRIDE, parm5q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]
%endmacro

%macro SAD_X3_2x4P 1
%if %1
    SAD_X3_START_1x8P d
%else
    SAD_X3_1x8P d, 0, 0
%endif
    SAD_X3_1x8P d, FENC_STRIDE, parm5q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm5q]
    lea     parm3q, [parm3q+2*parm5q]
    lea     parm4q, [parm4q+2*parm5q]
%endmacro

%macro SAD_X4_START_1x8P 1
    mov%1   mm7,    [parm1q]
    mov%1   mm0,    [parm2q]
    mov%1   mm1,    [parm3q]
    mov%1   mm2,    [parm4q]
    mov%1   mm3,    [parm5q]
    psadbw  mm0,    mm7
    psadbw  mm1,    mm7
    psadbw  mm2,    mm7
    psadbw  mm3,    mm7
%endmacro

; %1 = offset into fenc, %2 = offset into the candidates
%macro SAD_X4_1x8P 2
    movq    mm7,    [parm1q+%1]
    movq    mm4,    [parm2q+%2]
    movq    mm5,    [parm3q+%2]
    movq    mm6,    [parm4q+%2]
    psadbw  mm4,    mm7
    psadbw  mm5,    mm7
    psadbw  mm6,    mm7
    psadbw  mm7,    [parm5q+%2]     ; mm7 is reused for the 4th result
    paddw   mm0,    mm4
    paddw   mm1,    mm5
    paddw   mm2,    mm6
    paddw   mm3,    mm7
%endmacro

; 4-byte rows: the 4th candidate is loaded into mm4 (after its first use)
; rather than being used as a psadbw memory operand as in SAD_X4_1x8P.
%macro SAD_X4_1x4P 2
    movd    mm7,    [parm1q+%1]
    movd    mm4,    [parm2q+%2]
    movd    mm5,    [parm3q+%2]
    movd    mm6,    [parm4q+%2]
    psadbw  mm4,    mm7
    psadbw  mm5,    mm7
    paddw   mm0,    mm4
    psadbw  mm6,    mm7
    movd    mm4,    [parm5q+%2]
    paddw   mm1,    mm5
    psadbw  mm4,    mm7
    paddw   mm2,    mm6
    paddw   mm3,    mm4
%endmacro

%macro SAD_X4_2x16P 1
%if %1
    SAD_X4_START_1x8P q
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P 8, 8
    SAD_X4_1x8P FENC_STRIDE, parm6q
    SAD_X4_1x8P FENC_STRIDE+8, parm6q+8
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]
%endmacro

%macro SAD_X4_2x8P 1
%if %1
    SAD_X4_START_1x8P q
%else
    SAD_X4_1x8P 0, 0
%endif
    SAD_X4_1x8P FENC_STRIDE, parm6q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]
%endmacro

%macro SAD_X4_2x4P 1
%if %1
    SAD_X4_START_1x8P d
%else
    SAD_X4_1x4P 0, 0
%endif
    SAD_X4_1x4P FENC_STRIDE, parm6q
    add     parm1q, 2*FENC_STRIDE
    lea     parm2q, [parm2q+2*parm6q]
    lea     parm3q, [parm3q+2*parm6q]
    lea     parm4q, [parm4q+2*parm6q]
    lea     parm5q, [parm5q+2*parm6q]
%endmacro

; Store the three sums to scores[] (parm6q) and return.
%macro SAD_X3_END 0
    movd    [parm6q+0], mm0
    movd    [parm6q+4], mm1
    movd    [parm6q+8], mm2
    ret
%endmacro

; Store the four sums to scores[] (parm7q, fetched into rax) and return.
%macro SAD_X4_END 0
    mov     rax, parm7q
    movd    [rax+0], mm0
    movd    [rax+4], mm1
    movd    [rax+8], mm2
    movd    [rax+12], mm3
    ret
%endmacro

;-----------------------------------------------------------------------------
; ssd
;
; SSD_INC_1x<W>P: accumulate the squared differences of one row of width W.
;   in:      parm1q/parm2q = pix1/stride1, parm3q/parm4q = pix2/stride2,
;            mm7 = 0 (used to zero-extend bytes to words)
;   out:     mm0 += two dword partial sums; parm1q, parm3q advanced one row
;   clobber: mm1..mm6 (16P), mm1/mm2/mm5 (8P, 4P)
;-----------------------------------------------------------------------------

%macro SSD_INC_1x16P 0
    movq    mm1,    [parm1q]
    movq    mm2,    [parm3q]
    movq    mm3,    [parm1q+8]
    movq    mm4,    [parm3q+8]

    movq    mm5,    mm2
    movq    mm6,    mm4
    psubusb mm2,    mm1
    psubusb mm4,    mm3
    psubusb mm1,    mm5
    psubusb mm3,    mm6
    por     mm1,    mm2             ; |a-b| = sat(a-b) | sat(b-a)
    por     mm3,    mm4

    movq    mm2,    mm1
    movq    mm4,    mm3
    punpcklbw mm1,  mm7
    punpcklbw mm3,  mm7
    punpckhbw mm2,  mm7
    punpckhbw mm4,  mm7
    pmaddwd mm1,    mm1             ; square and pairwise add into dwords
    pmaddwd mm2,    mm2
    pmaddwd mm3,    mm3
    pmaddwd mm4,    mm4

    add     parm1q, parm2q
    add     parm3q, parm4q
    paddd   mm0,    mm1
    paddd   mm0,    mm2
    paddd   mm0,    mm3
    paddd   mm0,    mm4
%endmacro

%macro SSD_INC_1x8P 0
    movq    mm1,    [parm1q]
    movq    mm2,    [parm3q]

    movq    mm5,    mm2
    psubusb mm2,    mm1
    psubusb mm1,    mm5
    por     mm1,    mm2             ; mm1 = 8bit abs diff

    movq    mm2,    mm1
    punpcklbw mm1,  mm7
    punpckhbw mm2,  mm7             ; (mm1,mm2) = 16bit abs diff

    pmaddwd mm1,    mm1
    pmaddwd mm2,    mm2

    add     parm1q, parm2q
    add     parm3q, parm4q
    paddd   mm0,    mm1
    paddd   mm0,    mm2
%endmacro

%macro SSD_INC_1x4P 0
    movd    mm1,    [parm1q]
    movd    mm2,    [parm3q]

    movq    mm5,    mm2
    psubusb mm2,    mm1
    psubusb mm1,    mm5
    por     mm1,    mm2
    punpcklbw mm1,  mm7
    pmaddwd mm1,    mm1

    add     parm1q, parm2q
    add     parm3q, parm4q
    paddd   mm0,    mm1
%endmacro

;-----------------------------------------------------------------------------
; satd
;-----------------------------------------------------------------------------

; Load 4 pixels from each of [pix1] and [pix2]; leave their 16-bit
; differences in MMP. Each word of MMP is first built as (p1 | p2<<8) and
; each word of MMT as (p2 | p2<<8), so the subtraction yields p1 - p2.
%macro LOAD_DIFF_4P 4  ; MMP, MMT, [pix1], [pix2]
    movd        %1, %3
    movd        %2, %4
    punpcklbw   %1, %2
    punpcklbw   %2, %2
    psubw       %1, %2
%endmacro

; Two butterflies: (%1,%2) -> (%1+%2, %2-%1) and (%3,%4) -> (%3+%4, %4-%3)
%macro HADAMARD4_SUB_BADC 4
    paddw %1,   %2
    paddw %3,   %4
    paddw %2,   %2
    paddw %4,   %4
    psubw %2,   %1
    psubw %4,   %3
%endmacro

%macro HADAMARD4x4 4
    HADAMARD4_SUB_BADC %1, %2, %3, %4
    HADAMARD4_SUB_BADC %1, %3, %2, %4
%endmacro

; Interleave words of %1 and %2: low halves to %1, high halves to %3.
%macro SBUTTERFLYwd 3
    movq        %3, %1
    punpcklwd   %1, %2
    punpckhwd   %3, %2
%endmacro

; Interleave dwords of %1 and %2: low halves to %1, high halves to %3.
%macro SBUTTERFLYdq 3
    movq        %3, %1
    punpckldq   %1, %2
    punpckhdq   %3, %2
%endmacro

%macro TRANSPOSE4x4 5   ; abcd-t -> adtc
    SBUTTERFLYwd %1, %2, %5
    SBUTTERFLYwd %3, %4, %2
    SBUTTERFLYdq %1, %3, %4
    SBUTTERFLYdq %5, %2, %3
%endmacro

; Per-word absolute value: %1 = max(%1, 0 - %1)
%macro MMX_ABS 2        ; mma, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%endmacro

%macro MMX_ABS_TWO 4    ; mma, mmb, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%endmacro

; in:      mm4..mm7 = the four rows of 16-bit differences
; out:     %1 = per-column sums of the absolute transformed values, halved
;          with rounding by the final pavgw
; clobber: mm3..mm7
%macro HADAMARD4x4_SUM 1    ; %1 = dest (row sum of one block)
    HADAMARD4x4 mm4, mm5, mm6, mm7
    TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
    HADAMARD4x4 mm4, mm7, %1, mm6
    MMX_ABS_TWO mm4, mm7, mm3, mm5
    MMX_ABS_TWO %1,  mm6, mm3, mm5
    paddw       %1,  mm4
    paddw       mm6, mm7
    pavgw       %1,  mm6
%endmacro

; in: r10=3*stride1, r11=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: mm3..mm7
; out: %1 = satd
%macro LOAD_DIFF_HADAMARD_SUM 3
    LOAD_DIFF_4P mm4, mm3, [parm1q+%2],          [parm3q+%2]
    LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%2],   [parm3q+parm4q+%2]
    LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%2], [parm3q+2*parm4q+%2]
    LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%2],      [parm3q+r11+%2]
%if %3
    lea  parm1q, [parm1q+4*parm2q]
    lea  parm3q, [parm3q+4*parm4q]
%endif
    HADAMARD4x4_SUM %1
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_pixel_sad_16x16_mmxext
cglobal x264_pixel_sad_16x8_mmxext
cglobal x264_pixel_sad_8x16_mmxext
cglobal x264_pixel_sad_8x8_mmxext
cglobal x264_pixel_sad_8x4_mmxext
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext

cglobal x264_pixel_sad_x3_16x16_mmxext
cglobal x264_pixel_sad_x3_16x8_mmxext
cglobal x264_pixel_sad_x3_8x16_mmxext
cglobal x264_pixel_sad_x3_8x8_mmxext
cglobal x264_pixel_sad_x3_8x4_mmxext
cglobal x264_pixel_sad_x3_4x8_mmxext
cglobal x264_pixel_sad_x3_4x4_mmxext

cglobal x264_pixel_sad_x4_16x16_mmxext
cglobal x264_pixel_sad_x4_16x8_mmxext
cglobal x264_pixel_sad_x4_8x16_mmxext
cglobal x264_pixel_sad_x4_8x8_mmxext
cglobal x264_pixel_sad_x4_8x4_mmxext
cglobal x264_pixel_sad_x4_4x8_mmxext
cglobal x264_pixel_sad_x4_4x4_mmxext

cglobal x264_pixel_sad_pde_16x16_mmxext
cglobal x264_pixel_sad_pde_16x8_mmxext
cglobal x264_pixel_sad_pde_8x16_mmxext

cglobal x264_pixel_ssd_16x16_mmx
cglobal x264_pixel_ssd_16x8_mmx
cglobal x264_pixel_ssd_8x16_mmx
cglobal x264_pixel_ssd_8x8_mmx
cglobal x264_pixel_ssd_8x4_mmx
cglobal x264_pixel_ssd_4x8_mmx
cglobal x264_pixel_ssd_4x4_mmx

cglobal x264_pixel_satd_4x4_mmxext
cglobal x264_pixel_satd_4x8_mmxext
cglobal x264_pixel_satd_8x4_mmxext
cglobal x264_pixel_satd_8x8_mmxext
cglobal x264_pixel_satd_16x8_mmxext
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext

cglobal x264_intra_satd_x3_4x4_mmxext
cglobal x264_intra_satd_x3_8x8c_mmxext
cglobal x264_intra_satd_x3_16x16_mmxext

; Clear the SAD accumulator.
%macro SAD_START 0
    pxor    mm0, mm0
%endmacro

; Return the accumulated SAD in eax.
%macro SAD_END 0
    movd    eax, mm0
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; SAD w, h: emit x264_pixel_sad_<w>x<h>_mmxext, fully unrolled two rows at a
; time. None of these functions executes emms.
%macro SAD 2
ALIGN 16
x264_pixel_sad_%1x%2_mmxext:
    SAD_START
%rep %2/2
    SAD_INC_2x%1P
%endrep
    SAD_END
%endmacro

SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
SAD  4,  8
SAD  4,  4

;-----------------------------------------------------------------------------
;  void x264_pixel_sad_x3_16x16_mmxext( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                                       uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
; SAD_X n, w, h: emit x264_pixel_sad_x<n>_<w>x<h>_mmxext. The first row pair
; initialises the sums (argument 1); the remaining h/2-1 pairs accumulate.
%macro SAD_X 3
ALIGN 16
x264_pixel_sad_x%1_%2x%3_mmxext:
    SAD_X%1_2x%2P 1
%rep %3/2-1
    SAD_X%1_2x%2P 0
%endrep
    SAD_X%1_END
%endmacro

SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4

; Early termination: return the partial SAD (in eax) unless it is still
; below prev_score (signed compare). Defines the local label .continue, so
; it can be used at most once per function.
%macro PDE_CHECK 0
    movd    eax, mm0
    cmp     eax, parm5d ; prev_score
    jl      .continue
    ret
ALIGN 4
.continue:
%endmacro

;-----------------------------------------------------------------------------
;   int x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
; SAD_PDE w, h: SAD of the first h/2 rows, then PDE_CHECK, then the
; remaining h/2 rows. The result is either the partial SAD (early exit) or
; the full SAD.
%macro SAD_PDE 2
ALIGN 16
x264_pixel_sad_pde_%1x%2_mmxext:
    SAD_START
%rep %2/4
    SAD_INC_2x%1P
%endrep

    PDE_CHECK

%rep %2/4
    SAD_INC_2x%1P
%endrep
    SAD_END
%endmacro

SAD_PDE 16, 16
SAD_PDE 16,  8
SAD_PDE  8, 16

%macro SSD_START 0
    pxor    mm7,    mm7         ; zero
    pxor    mm0,    mm0         ; mm0 holds the sum
%endmacro

; Fold the two dword partial sums of mm0 together and return in eax.
%macro SSD_END 0
    movq    mm1,    mm0
    psrlq   mm1,    32
    paddd   mm0,    mm1
    movd    eax,    mm0
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int x264_pixel_ssd_16x16_mmx (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
; SSD w, h: emit x264_pixel_ssd_<w>x<h>_mmx, fully unrolled one row at a time.
%macro SSD 2
ALIGN 16
x264_pixel_ssd_%1x%2_mmx:
    SSD_START
%rep %2
    SSD_INC_1x%1P
%endrep
    SSD_END
%endmacro

SSD 16, 16
SSD 16,  8
SSD  8, 16
SSD  8,  8
SSD  8,  4
SSD  4,  8
SSD  4,  4

%macro SATD_START 0
    lea     r10, [3*parm2q] ; 3*stride1
    lea     r11, [3*parm4q] ; 3*stride2
%endmacro

; Sum the four words of mm0 (swap dwords and add, swap adjacent words and
; add) and return the low 16 bits in eax.
%macro SATD_END 0
    pshufw      mm1, mm0, 01001110b
    paddw       mm0, mm1
    pshufw      mm1, mm0, 10110001b
    paddw       mm0, mm1
    movd        eax, mm0
    and         eax, 0xffff
    ret
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_4x4_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_4x8_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
    paddw       mm0, mm1
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x4_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw       mm0, mm1
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x8_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw       mm0, mm2
    paddw       mm0, mm1
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x8_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0,  0, 0
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw       mm0, mm2

    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw       mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw       mm0, mm2
    paddw       mm0, mm1
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_8x16_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw       mm0, mm2

    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw       mm1, mm2
    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
    paddw       mm0, mm2
    paddw       mm0, mm1
    SATD_END

ALIGN 16
;-----------------------------------------------------------------------------
;   int x264_pixel_satd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_satd_16x16_mmxext:
    SATD_START
    LOAD_DIFF_HADAMARD_SUM mm0,  0, 0
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw       mm0, mm2

    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw       mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw       mm0, mm2

    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw       mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
    paddw       mm0, mm2

    LOAD_DIFF_HADAMARD_SUM mm2,  0, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1,  4, 0
    paddw       mm0, mm2
    LOAD_DIFF_HADAMARD_SUM mm2,  8, 0
    paddw       mm0, mm1
    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
    paddw       mm0, mm2
    paddw       mm0, mm1

    ; Unlike SATD_END, the last fold widens the two remaining words to
    ; dwords and adds with paddd, so the result is not limited to 16 bits.
    pxor        mm3, mm3
    pshufw      mm1, mm0, 01001110b
    paddw       mm0, mm1
    punpcklwd   mm0, mm3
    pshufw      mm1, mm0, 01001110b
    paddd       mm0, mm1
    movd        eax, mm0
    ret


; in: parm1 = fenc
; out: mm0..mm3 = hadamard coefs
; clobber: mm4, mm7
; Internal helper reached with `call`; its return address is pushed at the
; caller's rsp-8.
ALIGN 16
load_hadamard:
    pxor        mm7, mm7
    movd        mm0, [parm1q+0*FENC_STRIDE]
    movd        mm4, [parm1q+1*FENC_STRIDE]
    movd        mm3, [parm1q+2*FENC_STRIDE]
    movd        mm1, [parm1q+3*FENC_STRIDE]
    punpcklbw   mm0, mm7
    punpcklbw   mm4, mm7
    punpcklbw   mm3, mm7
    punpcklbw   mm1, mm7
    HADAMARD4x4 mm0, mm4, mm3, mm1
    TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
    HADAMARD4x4 mm0, mm1, mm2, mm3
    ret

; Scalar counterpart of HADAMARD4_SUB_BADC:
; (%1,%2) -> (%1+%2, %2-%1) and (%3,%4) -> (%3+%4, %4-%3)
%macro SCALAR_SUMSUB 4
    add %1, %2
    add %3, %4
    add %2, %2
    add %4, %4
    sub %2, %1
    sub %4, %3
%endmacro

; Horizontally reduce three word accumulators at once. The first fold is a
; word add; the low two words are then widened to dwords and combined with
; the caller-supplied op (%8), leaving the result in the low dword of each.
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor        %7, %7
    pshufw      %4, %1, 01001110b
    pshufw      %5, %2, 01001110b
    pshufw      %6, %3, 01001110b
    paddw       %1, %4
    paddw       %2, %5
    paddw       %3, %6
    punpcklwd   %1, %7
    punpcklwd   %2, %7
    punpcklwd   %3, %7
    pshufw      %4, %1, 01001110b
    pshufw      %5, %2, 01001110b
    pshufw      %6, %3, 01001110b
    %8          %1, %4
    %8          %2, %5
    %8          %3, %6
%endmacro

ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
; Writes res[0] = v, res[1] = h, res[2] = dc scores.
; Uses 16 bytes of scratch below rsp without adjusting rsp. top_1d sits at
; rsp-8, where `call load_hadamard` pushes its return address, so it is only
; written after that call has returned.
; clobber: rax, r8..r11, mm0..mm7
x264_intra_satd_x3_4x4_mmxext:
%define  top_1d  rsp-8  ; +8
%define  left_1d rsp-16 ; +8
    call load_hadamard

    movzx       r8d,  byte [parm2q-1+0*FDEC_STRIDE]
    movzx       r9d,  byte [parm2q-1+1*FDEC_STRIDE]
    movzx       r10d, byte [parm2q-1+2*FDEC_STRIDE]
    movzx       r11d, byte [parm2q-1+3*FDEC_STRIDE]
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 1x4 hadamard
    mov         [left_1d+0], r8w
    mov         [left_1d+2], r9w
    mov         [left_1d+4], r10w
    mov         [left_1d+6], r11w
    mov         eax, r8d ; dc

    movzx       r8d,  byte [parm2q-FDEC_STRIDE+0]
    movzx       r9d,  byte [parm2q-FDEC_STRIDE+1]
    movzx       r10d, byte [parm2q-FDEC_STRIDE+2]
    movzx       r11d, byte [parm2q-FDEC_STRIDE+3]
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 4x1 hadamard
    lea         rax, [rax + r8 + 4] ; dc
    mov         [top_1d+0], r8w
    mov         [top_1d+2], r9w
    mov         [top_1d+4], r10w
    mov         [top_1d+6], r11w
    and         eax, -8
    shl         eax, 1

    movq        mm4, mm1
    movq        mm5, mm2
    MMX_ABS_TWO mm4, mm5, mm6, mm7
    movq        mm7, mm3
    paddw       mm4, mm5
    MMX_ABS     mm7, mm6
    paddw       mm7, mm4 ; 3x4 sum

    movq        mm4, [left_1d]
    movd        mm5, eax
    psllw       mm4, 2
    psubw       mm4, mm0
    psubw       mm5, mm0
    punpcklwd   mm0, mm1
    punpcklwd   mm2, mm3
    punpckldq   mm0, mm2 ; transpose
    movq        mm1, [top_1d]
    psllw       mm1, 2
    psubw       mm0, mm1
    MMX_ABS     mm4, mm3 ; 1x4 sum
    MMX_ABS     mm5, mm2 ; 1x4 sum
    MMX_ABS     mm0, mm1 ; 4x1 sum
    paddw       mm4, mm7
    paddw       mm5, mm7
    movq        mm1, mm5
    psrlq       mm1, 16  ; 4x3 sum
    paddw       mm0, mm1

    SUM_MM_X3   mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
    movd        [parm3q+0], mm0 ; i4x4_v satd
    movd        [parm3q+4], mm4 ; i4x4_h satd
    movd        [parm3q+8], mm5 ; i4x4_dc satd
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
; Writes res[0] = v, res[1] = h, res[2] = dc scores.
; Uses 96 bytes of scratch below rsp without adjusting rsp. `sums` occupies
; rsp-32..rsp-9, so the return address that `call load_hadamard` pushes at
; rsp-8 does not overwrite it.
; clobber: rax, rcx, rsi, r8..r11, mm0..mm7; parm1q is advanced.
; NOTE(review): esi and ecx serve as scratch here; if amd64inc.asm maps any
; of parm1q..parm3q onto rsi/rcx the loops must not run before the last use
; of that argument -- parm2q is not read after .loop_edge; confirm mapping.
x264_intra_satd_x3_16x16_mmxext:
%define  sums    rsp-32 ; +24
%define  top_1d  rsp-64 ; +32
%define  left_1d rsp-96 ; +32

    mov   qword [sums+0], 0
    mov   qword [sums+8], 0
    mov   qword [sums+16], 0

    ; 1D hadamards
    xor         ecx, ecx            ; ecx accumulates the DC terms
    mov         eax, 12             ; eax = edge offset: 12, 8, 4, 0
.loop_edge:
    ; left
    shl         eax,  5 ; log(FDEC_STRIDE)
    movzx       r8d,  byte [parm2q+rax-1+0*FDEC_STRIDE]
    movzx       r9d,  byte [parm2q+rax-1+1*FDEC_STRIDE]
    movzx       r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
    movzx       r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
    shr         eax,  5
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d
    add         ecx,  r8d
    mov         [left_1d+2*rax+0], r8w
    mov         [left_1d+2*rax+2], r9w
    mov         [left_1d+2*rax+4], r10w
    mov         [left_1d+2*rax+6], r11w

    ; top
    movzx       r8d,  byte [parm2q+rax-FDEC_STRIDE+0]
    movzx       r9d,  byte [parm2q+rax-FDEC_STRIDE+1]
    movzx       r10d, byte [parm2q+rax-FDEC_STRIDE+2]
    movzx       r11d, byte [parm2q+rax-FDEC_STRIDE+3]
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d
    add         ecx,  r8d
    mov         [top_1d+2*rax+0], r8w
    mov         [top_1d+2*rax+2], r9w
    mov         [top_1d+2*rax+4], r10w
    mov         [top_1d+2*rax+6], r11w
    sub         eax,  4
    jge .loop_edge

    ; dc
    shr         ecx,  1
    add         ecx,  8
    and         ecx, -16

    ; 2D hadamards
    xor         eax, eax            ; eax = block row (0..3)
.loop_y:
    xor         esi, esi            ; esi = block column (0..3)
.loop_x:
    call load_hadamard

    movq        mm4, mm1
    movq        mm5, mm2
    MMX_ABS_TWO mm4, mm5, mm6, mm7
    movq        mm7, mm3
    paddw       mm4, mm5
    MMX_ABS     mm7, mm6
    paddw       mm7, mm4 ; 3x4 sum

    movq        mm4, [left_1d+8*rax]
    movd        mm5, ecx
    psllw       mm4, 2
    psubw       mm4, mm0
    psubw       mm5, mm0
    punpcklwd   mm0, mm1
    punpcklwd   mm2, mm3
    punpckldq   mm0, mm2 ; transpose
    movq        mm1, [top_1d+8*rsi]
    psllw       mm1, 2
    psubw       mm0, mm1
    MMX_ABS     mm4, mm3 ; 1x4 sum
    MMX_ABS     mm5, mm2 ; 1x4 sum
    MMX_ABS     mm0, mm1 ; 4x1 sum
    pavgw       mm4, mm7
    pavgw       mm5, mm7
    paddw       mm0, [sums+0]  ; i4x4_v satd
    paddw       mm4, [sums+8]  ; i4x4_h satd
    paddw       mm5, [sums+16] ; i4x4_dc satd
    movq        [sums+0], mm0
    movq        [sums+8], mm4
    movq        [sums+16], mm5

    add         parm1q, 4
    inc         esi
    cmp         esi, 4
    jl  .loop_x
    add         parm1q, 4*FENC_STRIDE-16
    inc         eax
    cmp         eax, 4
    jl  .loop_y

    ; horizontal sum
    movq        mm2, [sums+16]
    movq        mm1, [sums+8]
    movq        mm0, [sums+0]
    movq        mm7, mm2
    SUM_MM_X3   mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    psrld       mm0, 1
    pslld       mm7, 16
    psrld       mm7, 16             ; keep only word 0 of the dc sums
    paddd       mm0, mm2
    psubd       mm0, mm7
    movd        [parm3q+8], mm2 ; i16x16_dc satd
    movd        [parm3q+4], mm1 ; i16x16_h satd
    movd        [parm3q+0], mm0 ; i16x16_v satd
    ret

ALIGN 16
;-----------------------------------------------------------------------------
;  void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
; Writes res[0] = dc, res[1] = h, res[2] = v scores (note the order differs
; from the 4x4 and 16x16 variants).
; Uses 64 bytes of scratch below rsp without adjusting rsp. `sums` occupies
; rsp-32..rsp-9, so the return address that `call load_hadamard` pushes at
; rsp-8 does not overwrite it.
; clobber: rax, rcx, rsi, r8..r11, mm0..mm7; parm1q is advanced.
x264_intra_satd_x3_8x8c_mmxext:
%define  sums    rsp-32 ; +24
%define  top_1d  rsp-48 ; +16
%define  left_1d rsp-64 ; +16

    mov   qword [sums+0], 0
    mov   qword [sums+8], 0
    mov   qword [sums+16], 0

    ; 1D hadamards
    mov         eax, 4              ; eax = edge offset: 4, 0
.loop_edge:
    ; left
    shl         eax,  5 ; log(FDEC_STRIDE)
    movzx       r8d,  byte [parm2q+rax-1+0*FDEC_STRIDE]
    movzx       r9d,  byte [parm2q+rax-1+1*FDEC_STRIDE]
    movzx       r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
    movzx       r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
    shr         eax,  5
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d
    mov         [left_1d+2*rax+0], r8w
    mov         [left_1d+2*rax+2], r9w
    mov         [left_1d+2*rax+4], r10w
    mov         [left_1d+2*rax+6], r11w

    ; top
    movzx       r8d,  byte [parm2q+rax-FDEC_STRIDE+0]
    movzx       r9d,  byte [parm2q+rax-FDEC_STRIDE+1]
    movzx       r10d, byte [parm2q+rax-FDEC_STRIDE+2]
    movzx       r11d, byte [parm2q+rax-FDEC_STRIDE+3]
    SCALAR_SUMSUB r8d, r9d, r10d, r11d
    SCALAR_SUMSUB r8d, r10d, r9d, r11d
    mov         [top_1d+2*rax+0], r8w
    mov         [top_1d+2*rax+2], r9w
    mov         [top_1d+2*rax+4], r10w
    mov         [top_1d+2*rax+6], r11w
    sub         eax,  4
    jge .loop_edge

    ; dc
    movzx       r8d,  word [left_1d+0]
    movzx       r9d,  word [top_1d+0]
    movzx       r10d, word [left_1d+8]
    movzx       r11d, word [top_1d+8]
    add         r8d,  r9d
    lea         r9,  [r10 + r11]
    lea         r8,  [2*r8 + 8]
    lea         r9,  [2*r9 + 8]
    lea         r10, [4*r10 + 8]
    lea         r11, [4*r11 + 8]
    and         r8d,  -16 ; tl
    and         r9d,  -16 ; br
    and         r10d, -16 ; bl
    and         r11d, -16 ; tr
    ; Pack the four DC values into r9 as 16-bit fields, lowest first:
    ; tl, tr, bl, br -- the order in which the 2D loop consumes them.
    shl         r9,   16
    mov         r9w,  r10w
    shl         r9,   16
    mov         r9w,  r11w
    shl         r9,   16
    mov         r9w,  r8w

    ; 2D hadamards
    xor         eax, eax            ; eax = block row (0..1)
.loop_y:
    xor         esi, esi            ; esi = block column (0..1)
.loop_x:
    call load_hadamard

    movq        mm4, mm1
    movq        mm5, mm2
    MMX_ABS_TWO mm4, mm5, mm6, mm7
    movq        mm7, mm3
    paddw       mm4, mm5
    MMX_ABS     mm7, mm6
    paddw       mm7, mm4 ; 3x4 sum

    movq        mm4, [left_1d+8*rax]
    movzx       ecx, r9w            ; pop this block's DC value
    shr         r9,  16
    movd        mm5, ecx
    psllw       mm4, 2
    psubw       mm4, mm0
    psubw       mm5, mm0
    punpcklwd   mm0, mm1
    punpcklwd   mm2, mm3
    punpckldq   mm0, mm2 ; transpose
    movq        mm1, [top_1d+8*rsi]
    psllw       mm1, 2
    psubw       mm0, mm1
    MMX_ABS     mm4, mm3 ; 1x4 sum
    MMX_ABS     mm5, mm2 ; 1x4 sum
    MMX_ABS     mm0, mm1 ; 4x1 sum
    pavgw       mm4, mm7
    pavgw       mm5, mm7
    paddw       mm0, [sums+16] ; i4x4_v satd
    paddw       mm4, [sums+8]  ; i4x4_h satd
    paddw       mm5, [sums+0]  ; i4x4_dc satd
    movq        [sums+16], mm0
    movq        [sums+8], mm4
    movq        [sums+0], mm5

    add         parm1q, 4
    inc         esi
    cmp         esi, 2
    jl  .loop_x
    add         parm1q, 4*FENC_STRIDE-8
    inc         eax
    cmp         eax, 2
    jl  .loop_y

    ; horizontal sum
    movq        mm0, [sums+0]
    movq        mm1, [sums+8]
    movq        mm2, [sums+16]
    movq        mm7, mm0
    psrlq       mm7, 15
    paddw       mm2, mm7
    SUM_MM_X3   mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
    psrld       mm2, 1
    movd        [parm3q+0], mm0 ; i8x8c_dc satd
    movd        [parm3q+4], mm1 ; i8x8c_h satd
    movd        [parm3q+8], mm2 ; i8x8c_v satd
    ret