; Source: www.pudn.com > T264-src-0.02.zip > interpolate_sse2.asm
;/*****************************************************************************
; *
; *  T264 AVC CODEC
; *
; *  Copyright(C) 2004-2005 llcc
; *               2004-2005 visionany
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; ****************************************************************************/

; Syntax : NASM (Intel operand order)
; Target : 32-bit x86 with SSE2
; ABI    : all arguments are read from the stack; ebx/esi/edi/ebp are saved
;          and restored by every routine.  No MMX register is touched, so the
;          x87 state is left intact and no `emms` is required afterwards.

bits 32

; idea from xvid: export the symbol with a leading underscore and make the
; plain name an alias for it inside this file.
%macro cglobal 1
    global _%1
    %define %1 _%1
%endmacro

; load8b dst, addr, zero
;   %1 = destination xmm register
;   %2 = address expression (placed inside [])
;   %3 = xmm register holding all zeros
; Loads 8 bytes from [%2] and zero-extends them to 8 words in %1.
%macro load8b 3
        movq        %1, [%2]
        punpcklbw   %1, %3
%endmacro

; Bytes pushed by each prologue (ebx, esi, edi, ebp) that sit between esp and
; the return address; argument N lives at [esp + SAVED_REGS + 4*N].
%define SAVED_REGS  16

section .rodata data align=16

align 16
sse2_20:    times 8 dw 20           ; centre tap weight
sse2_5n:    times 8 dw -5           ; inner tap weight
sse2_16:    times 8 dw 16           ; rounding term added before the >> 5

section .text

;======================================================
;
; void
; interpolate_halfpel_h_sse2(uint8_t* src, int32_t src_stride,
;                            uint8_t* dst, int32_t dst_stride,
;                            int32_t width, int32_t height)
;
; For every output byte at column x of each row:
;   dst[x] = sat_u8((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                    - 5*src[x+2] + src[x+3] + 16) >> 5)
;
; Columns are processed in groups of 8, so a width that is not a multiple
; of 8 is effectively rounded up.  Each row is read from column -2 up to
; column (rounded width) + 2.  Returns immediately when width <= 0 or
; height <= 0.
;
; Registers: esi = src row, edi = dst row, eax = src_stride,
;            ebx = dst_stride, ebp = width, ecx = column, edx = rows left,
;            xmm4 = 16, xmm5 = -5, xmm6 = 20, xmm7 = 0, xmm0-xmm3 = scratch.
;
;======================================================
align 16
cglobal interpolate_halfpel_h_sse2
interpolate_halfpel_h_sse2:
        push        ebx
        push        esi
        push        edi
        push        ebp

        mov         esi, [esp + SAVED_REGS + 4]     ; src
        mov         eax, [esp + SAVED_REGS + 8]     ; src_stride
        mov         edi, [esp + SAVED_REGS + 12]    ; dst
        mov         ebx, [esp + SAVED_REGS + 16]    ; dst_stride
        mov         ebp, [esp + SAVED_REGS + 20]    ; width
        mov         edx, [esp + SAVED_REGS + 24]    ; height

        test        ebp, ebp
        jle         .done                           ; nothing to do for width <= 0
        test        edx, edx
        jle         .done                           ; nothing to do for height <= 0

        ; loop-invariant constants
        pxor        xmm7, xmm7
        movdqa      xmm6, [sse2_20]
        movdqa      xmm5, [sse2_5n]
        movdqa      xmm4, [sse2_16]

.looprow:
        xor         ecx, ecx
.loopcol:
        ; outer taps: src[x-2] + src[x+3]
        load8b      xmm0, esi + ecx - 2, xmm7
        load8b      xmm3, esi + ecx + 3, xmm7
        paddw       xmm0, xmm3
        ; inner taps: (src[x-1] + src[x+2]) * -5
        load8b      xmm1, esi + ecx - 1, xmm7
        load8b      xmm3, esi + ecx + 2, xmm7
        paddw       xmm1, xmm3
        pmullw      xmm1, xmm5
        ; centre taps: (src[x] + src[x+1]) * 20
        load8b      xmm2, esi + ecx, xmm7
        load8b      xmm3, esi + ecx + 1, xmm7
        paddw       xmm2, xmm3
        pmullw      xmm2, xmm6

        paddw       xmm0, xmm1
        paddw       xmm0, xmm2
        paddw       xmm0, xmm4                      ; + 16
        psraw       xmm0, 5

        packuswb    xmm0, xmm0                      ; saturate words to bytes
        movq        [edi + ecx], xmm0

        add         ecx, 8
        cmp         ecx, ebp
        jl          .loopcol                        ; signed: width is int32_t

        add         esi, eax
        add         edi, ebx
        dec         edx
        jnz         .looprow

.done:
        pop         ebp
        pop         edi
        pop         esi
        pop         ebx
        ret

;======================================================
;
; void
; interpolate_halfpel_v_sse2(uint8_t* src, int32_t src_stride,
;                            uint8_t* dst, int32_t dst_stride,
;                            int32_t width, int32_t height)
;
; Same filter as the horizontal routine, applied down a column.  With
; s = src_stride and p pointing at the source byte for the output position:
;   dst[x] = sat_u8((p[-2*s] - 5*p[-s] + 20*p[0] + 20*p[s]
;                    - 5*p[2*s] + p[3*s] + 16) >> 5)
;
; Columns are processed in groups of 8, so a width that is not a multiple
; of 8 is effectively rounded up.  Source rows -2 .. height + 2 are read.
; Returns immediately when width <= 0 or height <= 0.
;
; Registers: esi = src row, edi = dst row, eax = src_stride, ebp = width,
;            ebx = tap pointer (scratch), ecx = column, edx = rows left,
;            xmm4 = 16, xmm5 = -5, xmm6 = 20, xmm7 = 0, xmm0-xmm3 = scratch.
;            dst_stride is re-read from the stack because ebx is in use.
;
;======================================================
align 16
cglobal interpolate_halfpel_v_sse2
interpolate_halfpel_v_sse2:
        push        ebx
        push        esi
        push        edi
        push        ebp

        mov         esi, [esp + SAVED_REGS + 4]     ; src
        mov         eax, [esp + SAVED_REGS + 8]     ; src_stride
        mov         edi, [esp + SAVED_REGS + 12]    ; dst
        mov         ebp, [esp + SAVED_REGS + 20]    ; width
        mov         edx, [esp + SAVED_REGS + 24]    ; height

        test        ebp, ebp
        jle         .done                           ; nothing to do for width <= 0
        test        edx, edx
        jle         .done                           ; nothing to do for height <= 0

        ; loop-invariant constants
        pxor        xmm7, xmm7
        movdqa      xmm6, [sse2_20]
        movdqa      xmm5, [sse2_5n]
        movdqa      xmm4, [sse2_16]

.looprow:
        xor         ecx, ecx
.loopcol:
        ; ebx -> row -2 of the current column group
        lea         ebx, [esi + ecx]
        sub         ebx, eax
        sub         ebx, eax
        load8b      xmm0, ebx, xmm7                 ; row -2
        load8b      xmm1, ebx + eax, xmm7           ; row -1
        load8b      xmm2, ebx + eax*2, xmm7         ; row  0

        ; ebx -> row +1
        lea         ebx, [ebx + eax*2]
        add         ebx, eax
        load8b      xmm3, ebx, xmm7                 ; row +1
        paddw       xmm2, xmm3                      ; centre pair
        load8b      xmm3, ebx + eax, xmm7           ; row +2
        paddw       xmm1, xmm3                      ; inner pair
        load8b      xmm3, ebx + eax*2, xmm7         ; row +3
        paddw       xmm0, xmm3                      ; outer pair

        pmullw      xmm2, xmm6                      ; * 20
        pmullw      xmm1, xmm5                      ; * -5

        paddw       xmm0, xmm1
        paddw       xmm0, xmm2
        paddw       xmm0, xmm4                      ; + 16
        psraw       xmm0, 5

        packuswb    xmm0, xmm0                      ; saturate words to bytes
        movq        [edi + ecx], xmm0

        add         ecx, 8
        cmp         ecx, ebp
        jl          .loopcol                        ; signed: width is int32_t

        add         esi, eax
        add         edi, [esp + SAVED_REGS + 16]    ; dst_stride
        dec         edx
        jnz         .looprow

.done:
        pop         ebp
        pop         edi
        pop         esi
        pop         ebx
        ret

;======================================================
;
; void
; interpolate_halfpel_hv_sse2(uint8_t* src, int32_t src_stride,
;                             uint8_t* dst, int32_t dst_stride,
;                             int32_t width, int32_t height)
;
; Stub: returns immediately without reading src or writing dst.
;
;======================================================
align 16
cglobal interpolate_halfpel_hv_sse2
interpolate_halfpel_hv_sse2:
        ret