; Source: www.pudn.com > T264-src-0.02.zip > dct_sse2.asm
;/*****************************************************************************
; *
; *  T264 AVC CODEC
; *
; *  Copyright(C) 2004-2005 llcc
; *               2004-2005 visionany
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ****************************************************************************/

;-----------------------------------------------------------------------------
; Syntax : NASM (Intel operand order)
; Target : 32-bit x86, MMX + SSE2
; ABI    : arguments are read from the stack at [esp + 4], [esp + 8], ...;
;          exported/imported symbols carry a leading underscore.
;
; In the comments below a "row" is one 64-bit MMX register holding four
; int16 values, and register positions are numbered 0..4 for macro
; parameters %1..%5.
;-----------------------------------------------------------------------------

bits 32

; idea from xvid
; cglobal name : export _name and let the bare name refer to it.
%macro cglobal 1
    global _%1
    %define %1 _%1
%endmacro

; cextern name : import _name and let the bare name refer to it.
%macro cextern 1
    extern _%1
    %define %1 _%1
%endmacro

;-----------------------------------------------------------------------------
; transpose r0, r1, r2, r3, tmp
;   4x4 transpose of int16 values held in four MMX registers.
;   In : %1..%4 = rows 0..3, %5 = scratch
;   Out: column 0 in %1, column 1 in %2, column 2 in %5, column 3 in %4
;        (i.e. "input 0 1 2 3, output 0 1 4 3"); %3 is left holding an
;        intermediate value.
;   The per-line comments list element numbers 1..16 (row-major), highest
;   word first.
;-----------------------------------------------------------------------------
%macro transpose 5
    movq        %5, %1
    punpckhwd   %5, %2              ; %5 =  8  4  7  3
    punpcklwd   %1, %2              ; %1 =  6  2  5  1
    movq        %2, %3
    punpckhwd   %2, %4              ; %2 = 16 12 15 11
    punpcklwd   %3, %4              ; %3 = 14 10 13  9
    movq        %4, %5
    punpckhdq   %4, %2              ; %4 = 16 12  8  4   (column 3)
    punpckldq   %5, %2              ; %5 = 15 11  7  3   (column 2)
    movq        %2, %1
    punpckhdq   %2, %3              ; %2 = 14 10  6  2   (column 1)
    punpckldq   %1, %3              ; %1 = 13  9  5  1   (column 0)
%endmacro

;-----------------------------------------------------------------------------
; addsub a, b, c, d, tmp
;   Out: %1 = a + d, %2 = b + c, %4 = b - c, %5 = a - d   (%3 unchanged)
;        i.e. "input 0 1 2 3, output 0 1 3 4".
;-----------------------------------------------------------------------------
%macro addsub 5
    movq        %5, %1
    paddw       %1, %4              ; %1 = a + d
    psubw       %5, %4              ; %5 = a - d
    movq        %4, %2
    paddw       %2, %3              ; %2 = b + c
    psubw       %4, %3              ; %4 = b - c
%endmacro

;-----------------------------------------------------------------------------
; addsub2 s0, s1, s2, s3, tmp
;   Out: %1 = s0 + s1, %2 = 2*s3 + s2, %5 = s0 - s1, %4 = s3 - 2*s2
;        i.e. "input 0 1 2 3, output 0 1 4 3".  %3 is left holding 2*s2.
;-----------------------------------------------------------------------------
%macro addsub2 5
    movq        %5, %1              ; %5 = s0
    paddw       %1, %2              ; d0 = s0 + s1
    psubw       %5, %2              ; d2 = s0 - s1
    movq        %2, %4              ; %2 = s3
    paddw       %2, %2              ; %2 = 2 * s3
    paddw       %2, %3              ; d1 = 2 * s3 + s2
    paddw       %3, %3              ; %3 = 2 * s2
    psubw       %4, %3              ; d3 = s3 - 2 * s2
%endmacro

;-----------------------------------------------------------------------------
; idct_addsub2 d0, d1, d2, d3, tmp
;   Out: %1 = d0 + d2, %5 = d0 - d2, %2 = (d1 >> 1) - d3, %3 = d1 + (d3 >> 1)
;        i.e. "input 0 1 2 3, output 0 4 1 2".  %4 is left holding d3 >> 1.
;-----------------------------------------------------------------------------
%macro idct_addsub2 5
    movq        %5, %1              ; %5 = d0
    paddw       %1, %3              ; s0 = d0 + d2
    psubw       %5, %3              ; s1 = d0 - d2
    movq        %3, %2              ; %3 = d1
    psraw       %2, 1               ; %2 = d1 >> 1
    psubw       %2, %4              ; s2 = (d1 >> 1) - d3
    psraw       %4, 1               ; %4 = d3 >> 1
    paddw       %3, %4              ; s3 = d1 + (d3 >> 1)
%endmacro

;-----------------------------------------------------------------------------
; word2dw mmx, mmx_tmp, mmx_zero, xmm, xmm_tmp
;   Widens the four words of %1 to four dwords in %4 by interleaving with %3
;   (expected to be zero).  Clobbers %1, %2 and %5.
;   Not referenced by any routine in this file.
;-----------------------------------------------------------------------------
%macro word2dw 5
    movq        %2, %1
    punpcklwd   %2, %3              ; dcba -> 0b0a
    punpckhwd   %1, %3              ; dcba -> 0d0c
    movq2dq     %4, %1              ; 00 00 0d 0c
    pslldq      %4, 8               ; 0d 0c 00 00
    movq2dq     %5, %2              ; 00 00 0b 0a
    por         %4, %5              ; 0d 0c 0b 0a
%endmacro

;-----------------------------------------------------------------------------
; quantize8 byte_offset
;   Quantises eight int16 coefficients in place.  Shared by both halves of
;   quant4x4_sse2.
;   In : edi  = coefficient block (read/written with movdqa)
;        esi  = multiplier row    (read with movdqa)
;        xmm6 = rounding term f replicated in all four dwords
;        xmm7 = shift count qbits in the low quadword
;   Per lane: out = sign(x) * ((|x| * mf + f) >> qbits); zero stays zero.
;   Clobbers xmm0-xmm5.
;-----------------------------------------------------------------------------
%macro quantize8 1
    movdqa      xmm0, [edi + %1]    ; xmm0 = coefficients
    movdqa      xmm1, [esi + %1]    ; xmm1 = multipliers

    ; ---- lanes with coefficient > 0 ----
    pxor        xmm4, xmm4
    movdqa      xmm2, xmm0          ; xmm2 = untouched copy of the input
    pcmpgtw     xmm0, xmm4          ; word mask: x > 0
    movdqa      xmm4, xmm0          ; xmm4 = mask
    pand        xmm0, xmm2          ; keep positive lanes, zero the rest
    movdqa      xmm3, xmm0
    pmullw      xmm0, xmm1          ; low  16 bits of x * mf
    pmulhw      xmm3, xmm1          ; high 16 bits of x * mf (signed)
    movdqa      xmm5, xmm0
    punpcklwd   xmm0, xmm3          ; lanes 0..3 as 32-bit products
    punpckhwd   xmm5, xmm3          ; lanes 4..7 as 32-bit products
    movdqa      xmm3, xmm4
    punpcklwd   xmm4, xmm4          ; widen mask for lanes 0..3
    pand        xmm4, xmm6          ; f only where x > 0
    paddd       xmm0, xmm4          ; x * mf + f
    psrad       xmm0, xmm7          ; (x * mf + f) >> qbits
    punpckhwd   xmm3, xmm3          ; widen mask for lanes 4..7
    pand        xmm3, xmm6
    paddd       xmm5, xmm3          ; x * mf + f
    psrad       xmm5, xmm7          ; (x * mf + f) >> qbits
    packssdw    xmm0, xmm5          ; xmm0 = results for positive lanes

    ; ---- lanes with coefficient < 0 ----
    pxor        xmm4, xmm4
    movdqa      xmm5, xmm2
    pcmpgtw     xmm4, xmm2          ; word mask: 0 > x
    pand        xmm5, xmm4          ; keep negative lanes, zero the rest
    pmullw      xmm5, [sse2_neg1]   ; -x, i.e. the magnitude
    movdqa      xmm3, xmm5
    pmullw      xmm5, xmm1          ; low  16 bits of |x| * mf
    pmulhw      xmm3, xmm1          ; high 16 bits of |x| * mf
    movdqa      xmm1, xmm5          ; multipliers are no longer needed
    punpcklwd   xmm5, xmm3          ; lanes 0..3 as 32-bit products
    punpckhwd   xmm1, xmm3          ; lanes 4..7 as 32-bit products
    movdqa      xmm3, xmm4
    punpcklwd   xmm4, xmm4
    pand        xmm4, xmm6          ; f only where x < 0
    paddd       xmm5, xmm4          ; |x| * mf + f
    psrad       xmm5, xmm7
    punpckhwd   xmm3, xmm3
    pand        xmm3, xmm6
    paddd       xmm1, xmm3
    psrad       xmm1, xmm7
    packssdw    xmm5, xmm1
    pmullw      xmm5, [sse2_neg1]   ; restore the sign

    por         xmm5, xmm0          ; the two lane sets are disjoint
    movdqa      [edi + %1], xmm5
%endmacro

section .rodata data align=16

align 16
sse2_neg1   dw -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
sse2_1      dw  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1

align 16
mmx1        dw  1,  1,  1,  1

align 16
mmx32       dw 32, 32, 32, 32

; Tables defined elsewhere in the project; indexed below in 32-byte rows.
cextern quant
cextern dequant

align 16

section .text

;======================================================
;
; void
; dct4x4_mmx(int16_t* data)
;
; In-place transform of the 4x4 int16 block at data (32 bytes):
; transpose, addsub + addsub2 butterfly, transpose, butterfly again.
;
; Clobbers: eax, mm0-mm4.
; Note    : no emms is executed; the MMX state is left in use on return,
;           so the caller must issue emms before running x87 code.
;
;======================================================
align 16
cglobal dct4x4_mmx
dct4x4_mmx:
    mov         eax, [esp + 4]      ; eax = data

    movq        mm0, [eax +  0]
    movq        mm1, [eax +  8]
    movq        mm2, [eax + 16]
    movq        mm3, [eax + 24]

    transpose   mm0, mm1, mm2, mm3, mm4     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm1, mm4, mm3, mm2     ; input 0 1 2 3, output 0 1 3 4
    ; s[0] = mm0, s[1] = mm1, s[2] = mm3, s[3] = mm2
    addsub2     mm0, mm1, mm3, mm2, mm4     ; input 0 1 2 3, output 0 1 4 3

    transpose   mm0, mm1, mm4, mm2, mm3     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm1, mm3, mm2, mm4     ; input 0 1 2 3, output 0 1 3 4
    ; s[0] = mm0, s[1] = mm1, s[2] = mm2, s[3] = mm4
    addsub2     mm0, mm1, mm2, mm4, mm3     ; input 0 1 2 3, output 0 1 4 3

    movq        [eax +  0], mm0
    movq        [eax +  8], mm1
    movq        [eax + 16], mm3
    movq        [eax + 24], mm4
    ret

;======================================================
;
; void
; dct4x4dc_mmx(int16_t* data)
;
; In-place transform of the 4x4 int16 block at data using add/sub
; butterflies only; every result is rounded as (x + 1) >> 1.
;
; Clobbers: eax, mm0-mm4.
; Note    : no emms is executed; the caller must issue emms before
;           running x87 code.
;
;======================================================
align 16
cglobal dct4x4dc_mmx
dct4x4dc_mmx:
    mov         eax, [esp + 4]      ; eax = data

    movq        mm0, [eax +  0]
    movq        mm1, [eax +  8]
    movq        mm2, [eax + 16]
    movq        mm3, [eax + 24]

    transpose   mm0, mm1, mm2, mm3, mm4     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm1, mm4, mm3, mm2     ; input 0 1 2 3, output 0 1 3 4
    ; s[0] = mm0, s[1] = mm1, s[2] = mm3, s[3] = mm2
    addsub      mm0, mm2, mm3, mm1, mm4     ; d0 = mm0, d1 = mm2, d2 = mm4, d3 = mm1

    transpose   mm0, mm2, mm4, mm1, mm3     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm2, mm3, mm1, mm4     ; input 0 1 2 3, output 0 1 3 4
    ; s[0] = mm0, s[1] = mm2, s[2] = mm1, s[3] = mm4
    addsub      mm0, mm4, mm1, mm2, mm3     ; d0 = mm0, d1 = mm4, d2 = mm3, d3 = mm2

    movq        mm1, [mmx1]         ; rounding term for the >> 1 below
    paddw       mm0, mm1
    paddw       mm4, mm1
    paddw       mm3, mm1
    paddw       mm2, mm1
    psraw       mm0, 1
    psraw       mm4, 1
    psraw       mm3, 1
    psraw       mm2, 1

    movq        [eax +  0], mm0
    movq        [eax +  8], mm4
    movq        [eax + 16], mm3
    movq        [eax + 24], mm2
    ret

;======================================================
;
; void
; idct4x4_mmx(int16_t* data)
;
; In-place inverse transform of the 4x4 int16 block at data:
; transpose, idct_addsub2 + addsub butterfly, transpose, butterfly
; again, then every result is rounded as (x + 32) >> 6.
;
; Clobbers: eax, mm0-mm4.
; Note    : no emms is executed; the caller must issue emms before
;           running x87 code.
;
;======================================================
align 16
cglobal idct4x4_mmx
idct4x4_mmx:
    mov         eax, [esp + 4]      ; eax = data

    movq        mm0, [eax +  0]
    movq        mm1, [eax +  8]
    movq        mm2, [eax + 16]
    movq        mm3, [eax + 24]

    transpose    mm0, mm1, mm2, mm3, mm4    ; input 0 1 2 3, output 0 1 4 3
    idct_addsub2 mm0, mm1, mm4, mm3, mm2    ; input 0 1 2 3, output 0 4 1 2
    ; s[0] = mm0, s[1] = mm2, s[2] = mm1, s[3] = mm4
    addsub       mm0, mm2, mm1, mm4, mm3    ; input 0 1 2 3, output 0 1 3 4

    transpose    mm0, mm2, mm4, mm3, mm1    ; input 0 1 2 3, output 0 1 4 3
    idct_addsub2 mm0, mm2, mm1, mm3, mm4    ; input 0 1 2 3, output 0 4 1 2
    ; s[0] = mm0, s[1] = mm4, s[2] = mm2, s[3] = mm1
    addsub       mm0, mm4, mm2, mm1, mm3    ; input 0 1 2 3, output 0 1 3 4

    movq        mm2, [mmx32]        ; rounding term for the >> 6 below
    paddw       mm0, mm2
    paddw       mm4, mm2
    paddw       mm1, mm2
    paddw       mm3, mm2
    psraw       mm0, 6
    psraw       mm4, 6
    psraw       mm1, 6
    psraw       mm3, 6

    movq        [eax +  0], mm0
    movq        [eax +  8], mm4
    movq        [eax + 16], mm1
    movq        [eax + 24], mm3
    ret

;======================================================
;
; void
; idct4x4dc_mmx(int16_t* data)
;
; In-place transform of the 4x4 int16 block at data using add/sub
; butterflies only; no rounding or scaling is applied.
;
; Clobbers: eax, mm0-mm4.
; Note    : no emms is executed; the caller must issue emms before
;           running x87 code.
;
;======================================================
align 16
cglobal idct4x4dc_mmx
idct4x4dc_mmx:
    mov         eax, [esp + 4]      ; eax = data

    movq        mm0, [eax +  0]
    movq        mm1, [eax +  8]
    movq        mm2, [eax + 16]
    movq        mm3, [eax + 24]

    transpose   mm0, mm1, mm2, mm3, mm4     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm1, mm3, mm4, mm2     ; input 0 1 2 3, output 0 4 3 1
    ; s[0] = mm0, s[1] = mm2, s[2] = mm4, s[3] = mm1
    addsub      mm0, mm2, mm4, mm1, mm3     ; input 0 1 2 3, output 0 1 3 4

    transpose   mm0, mm2, mm1, mm3, mm4     ; input 0 1 2 3, output 0 1 4 3
    addsub      mm0, mm2, mm3, mm4, mm1     ; input 0 1 2 3, output 0 4 3 1
    ; s[0] = mm0, s[1] = mm1, s[2] = mm4, s[3] = mm2
    addsub      mm0, mm1, mm4, mm2, mm3     ; input 0 1 2 3, output 0 1 3 4

    movq        [eax +  0], mm0
    movq        [eax +  8], mm1
    movq        [eax + 16], mm2
    movq        [eax + 24], mm3
    ret

;======================================================
;
; void
; quant4x4_sse2(int16_t* data, const int32_t Qp, int32_t is_intra)
;
; Quantises the 16 int16 coefficients at data in place:
;   qbits = 15 + Qp / 6
;   f     = (1 << qbits) / (is_intra ? 3 : 6)
;   out   = sign(x) * ((|x| * quant[Qp % 6][i] + f) >> qbits)
;
; data and the quant rows are accessed with movdqa, so both must be
; 16-byte aligned.  quant is indexed in rows of 32 bytes.
;
; Saves/restores ebx, esi, edi, ebp.
; Clobbers: eax, ecx, edx, xmm0-xmm7.
; Only integer and XMM registers are used, so the MMX/x87 state is
; left untouched.
;
;======================================================
align 16
cglobal quant4x4_sse2
quant4x4_sse2:
    push        ebx
    push        esi
    push        edi
    push        ebp

    mov         edi, [esp +  4 + 16]    ; edi = data
    mov         eax, [esp +  8 + 16]    ; eax = Qp
    cdq
    mov         ebp, [esp + 12 + 16]    ; ebp = is_intra
    mov         ebx, 6
    idiv        ebx                     ; eax = Qp / 6, edx = Qp % 6
    add         eax, 15                 ; eax = qbits
    mov         esi, edx
    shl         esi, 5                  ; 32 bytes per table row
    add         esi, quant              ; esi = &quant[Qp % 6]
    mov         ecx, eax                ; ecx = qbits

    ; ebp = is_intra ? 3 : 6, computed without a branch
    neg         ebp                     ; CF = (is_intra != 0)
    sbb         ebp, ebp                ; ebp = is_intra ? -1 : 0
    and         ebp, -3                 ; ebp = is_intra ? -3 : 0
    add         ebp, 6                  ; ebp = is_intra ?  3 : 6

    mov         eax, 1
    shl         eax, cl                 ; eax = 1 << qbits
    cdq
    idiv        ebp                     ; eax = f

    ; eax = f, ecx = qbits, esi = quant row, edi = data
    movd        xmm6, eax
    movd        xmm7, ecx               ; xmm7 = shift count for psrad
    pshufd      xmm6, xmm6, 0x00        ; xmm6 = f in all four dwords

    quantize8   0                       ; coefficients 0..7
    quantize8   16                      ; coefficients 8..15

    pop         ebp
    pop         edi
    pop         esi
    pop         ebx
    ret

;======================================================
;
; void
; iquant4x4_sse2(int16_t* data, const int32_t Qp)
;
; Rescales the 16 int16 coefficients at data in place:
;   out = (x * dequant[Qp % 6][i]) << (Qp / 6)       (16-bit wraparound)
;
; data and the dequant rows are accessed with movdqa, so both must be
; 16-byte aligned.  dequant is indexed in rows of 32 bytes.
;
; Clobbers: eax, ecx, edx, xmm0, xmm1, xmm6, xmm7.
; Only integer and XMM registers are used, so the MMX/x87 state is
; left untouched.
;
;======================================================
align 16
cglobal iquant4x4_sse2
iquant4x4_sse2:
    mov         eax, [esp + 8]          ; eax = Qp
    cdq
    mov         ecx, 6
    idiv        ecx                     ; eax = Qp / 6, edx = Qp % 6
    mov         ecx, edx
    shl         ecx, 5                  ; 32 bytes per table row
    add         ecx, dequant            ; ecx = &dequant[Qp % 6]
    mov         edx, [esp + 4]          ; edx = data

    ; eax = qbits, ecx = dequant row, edx = data
    movd        xmm7, eax               ; xmm7 = shift count for psllw
    movdqa      xmm6, [sse2_1]
    psllw       xmm6, xmm7              ; xmm6 = 1 << qbits in every word

    movdqa      xmm0, [edx + 0]
    movdqa      xmm1, [ecx + 0]
    pmullw      xmm0, xmm1              ; x * dequant
    pmullw      xmm0, xmm6              ; ... << qbits
    movdqa      [edx + 0], xmm0

    movdqa      xmm0, [edx + 16]
    movdqa      xmm1, [ecx + 16]
    pmullw      xmm0, xmm1
    pmullw      xmm0, xmm6
    movdqa      [edx + 16], xmm0
    ret