; Source: www.pudn.com > T264-src-0.02.zip > interpolate_sse2.asm


;/***************************************************************************** 
; * 
; *  T264 AVC CODEC 
; * 
; *  Copyright(C) 2004-2005 llcc  
; *               2004-2005 visionany  
; * 
; *  This program is free software ; you can redistribute it and/or modify 
; *  it under the terms of the GNU General Public License as published by 
; *  the Free Software Foundation ; either version 2 of the License, or 
; *  (at your option) any later version. 
; * 
; *  This program is distributed in the hope that it will be useful, 
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of 
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
; *  GNU General Public License for more details. 
; * 
; *  You should have received a copy of the GNU General Public License 
; *  along with this program ; if not, write to the Free Software 
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA 
; * 
; ****************************************************************************/ 
 
bits 32 
 
; Idea borrowed from xvid: export a symbol under an underscore-prefixed name.
;   %1 = bare symbol name
; Declares _%1 as global, then redefines %1 so that it expands to _%1.
; Every later occurrence of the bare name in this file - including the label
; that defines the function - therefore refers to the underscored symbol.
%macro cglobal 1  
	global _%1  
	%define %1 _%1 
%endmacro 
 
; load8b: load 8 consecutive bytes and zero-extend them to 8 unsigned words.
;   %1 = destination xmm register; word i receives byte i of the source
;   %2 = source address expression, used as [%2] (no alignment required)
;   %3, %4 = ignored; kept so existing call sites that pass mm temporaries
;            still assemble
;   %5 = ignored; kept for call sites that pass an mm register holding zero
;   %6 = xmm scratch register, must differ from %1; left holding zero
; The expansion uses xmm registers only, so the macro itself never touches
; the MMX registers.
%macro load8b 6
    pxor %6, %6         ; zero bytes to interleave with
    movq %1, [%2]       ; 8 source bytes -> low qword, high qword cleared
    punpcklbw %1, %6    ; b0,0,b1,0,...,b7,0  ==  8 zero-extended words
%endmacro
 
; load8bv: load 8 bytes from a vertically offset row and zero-extend to words.
;   The bytes are read from  esi + ecx + (%2) * eax,  i.e. %2 rows away from
;   the current position when eax holds the row stride in bytes.
;   %1 = destination xmm register
;   %2 = constant signed row offset (any value that assembles as an immediate)
;   %3, %4, %5, %6 = forwarded unchanged to load8b
; Clobbers: ebx (left holding the computed address), flags, plus whatever
;           load8b clobbers.
%macro load8bv 6
    imul ebx, eax, %2   ; ebx = row offset * stride; no per-offset special cases
    add ebx, esi        ; + row base
    add ebx, ecx        ; + column
    load8b %1, ebx, %3, %4, %5, %6
%endmacro
 
section .rodata data align=16 

align 16 
; Each constant is eight 16-bit words (16 bytes), so all three labels stay
; 16-byte aligned, which the movdqa loads in the filter code require.
sse2_20: dw 20, 20, 20, 20, 20, 20, 20, 20     ; multiplier for the two centre taps
sse2_5n: dw -5, -5, -5, -5, -5, -5, -5, -5     ; multiplier for the taps next to the centre pair
sse2_16: dw 16, 16, 16, 16, 16, 16, 16, 16     ; bias added before the >> 5
 
section .text 
 
;======================================================
;
; void
; interpolate_halfpel_h_sse2(uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t width, int32_t height)
;
; Horizontal 6-tap filter. For each output position x in each row:
;
;   dst[x] = clip_0_255( ( src[x-2] - 5*src[x-1] + 20*src[x]
;                        + 20*src[x+1] - 5*src[x+2] + src[x+3] + 16 ) >> 5 )
;
; Eight outputs are produced per inner iteration, so each row is written in
; whole groups of 8 bytes: width is expected to be a multiple of 8 and is
; otherwise effectively rounded up. Per row the code reads src bytes from
; index -2 up to (rounded width) + 2. Nothing is read or written when
; width <= 0 or height <= 0.
;
; Calling convention: 32-bit, all arguments on the stack, plain ret (the
; caller removes the arguments). ebx/esi/edi/ebp are saved and restored.
; Only xmm registers are used for the pixel work, so no MMX state is left
; behind for the caller to clear.
;
; Registers inside the loops:
;   esi = src row      eax = src_stride     ecx = column (bytes)
;   edi = dst row      ebx = dst_stride     edx = rows still to do
;   ebp = width        xmm6 = 8 x 16        xmm7 = 0
;
;======================================================

align 16

cglobal interpolate_halfpel_h_sse2
interpolate_halfpel_h_sse2:

    push    ebx
    push    esi
    push    edi
    push    ebp

    mov     esi, [esp + 16 + 4]         ; src
    mov     eax, [esp + 16 + 8]         ; src_stride
    mov     edi, [esp + 16 + 12]        ; dst
    mov     ebx, [esp + 16 + 16]        ; dst_stride
    mov     ebp, [esp + 16 + 20]        ; width
    mov     edx, [esp + 16 + 24]        ; height

    test    ebp, ebp                    ; nothing to do for an empty block
    jle     .done
    test    edx, edx
    jle     .done

    pxor    xmm7, xmm7                  ; zero, for byte -> word unpacking
    movdqa  xmm6, [sse2_16]             ; rounding bias, loop invariant

.looprow:

    xor     ecx, ecx

.loopcol:

    ; six overlapping 8-byte windows, one per filter tap
    movq    xmm0, [esi + ecx - 2]
    movq    xmm1, [esi + ecx - 1]
    movq    xmm2, [esi + ecx]
    movq    xmm3, [esi + ecx + 1]
    movq    xmm4, [esi + ecx + 2]
    movq    xmm5, [esi + ecx + 3]

    punpcklbw xmm0, xmm7                ; widen every tap to 8 words
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7

    ; taps sharing a coefficient are summed first, then scaled once
    paddw   xmm1, xmm4                  ; src[x-1] + src[x+2]
    paddw   xmm2, xmm3                  ; src[x]   + src[x+1]
    paddw   xmm0, xmm5                  ; src[x-2] + src[x+3]
    pmullw  xmm1, [sse2_5n]             ; * -5  (constant is 16-byte aligned)
    pmullw  xmm2, [sse2_20]             ; * 20
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    paddw   xmm0, xmm6                  ; + 16
    psraw   xmm0, 5                     ; signed shift keeps negative sums negative

    packuswb xmm0, xmm0                 ; saturate to 0..255
    movq    [edi + ecx], xmm0

    add     ecx, 8
    cmp     ecx, ebp
    jl      .loopcol                    ; signed <, so a non-multiple-of-8 width still terminates

    add     esi, eax
    add     edi, ebx
    dec     edx
    jnz     .looprow

.done:
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx

    ret
     
;======================================================
;
; void
; interpolate_halfpel_v_sse2(uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t width, int32_t height)
;
; Vertical 6-tap filter. With s = src_stride, for each output position x in
; each row:
;
;   dst[x] = clip_0_255( ( src[x-2*s] - 5*src[x-s] + 20*src[x]
;                        + 20*src[x+s] - 5*src[x+2*s] + src[x+3*s] + 16 ) >> 5 )
;
; Eight outputs are produced per inner iteration, so each row is written in
; whole groups of 8 bytes: width is expected to be a multiple of 8 and is
; otherwise effectively rounded up. For every output row the code reads the
; source rows from 2 above to 3 below it. Nothing is read or written when
; width <= 0 or height <= 0.
;
; Calling convention: 32-bit, all arguments on the stack, plain ret (the
; caller removes the arguments). ebx/esi/edi/ebp are saved and restored.
; Only xmm registers are used for the pixel work, so no MMX state is left
; behind for the caller to clear.
;
; Registers inside the loops:
;   esi = src row      eax = src_stride     ecx = column (bytes)
;   edi = dst row      ebx = tap pointer    edx = rows still to do
;   ebp = width        xmm6 = 8 x 16        xmm7 = 0
;
;======================================================

align 16

cglobal interpolate_halfpel_v_sse2
interpolate_halfpel_v_sse2:

    push    ebx
    push    esi
    push    edi
    push    ebp

    mov     esi, [esp + 16 + 4]         ; src
    mov     eax, [esp + 16 + 8]         ; src_stride
    mov     edi, [esp + 16 + 12]        ; dst
    mov     ebp, [esp + 16 + 20]        ; width
    mov     edx, [esp + 16 + 24]        ; height

    test    ebp, ebp                    ; nothing to do for an empty block
    jle     .done
    test    edx, edx
    jle     .done

    pxor    xmm7, xmm7                  ; zero, for byte -> word unpacking
    movdqa  xmm6, [sse2_16]             ; rounding bias, loop invariant

.looprow:

    xor     ecx, ecx

.loopcol:

    ; walk one pointer over the six source rows instead of rebuilding
    ; every row address from scratch
    lea     ebx, [esi + ecx]            ; row  0
    movq    xmm2, [ebx]
    movq    xmm3, [ebx + eax]           ; row +1
    movq    xmm4, [ebx + eax*2]         ; row +2
    sub     ebx, eax
    movq    xmm1, [ebx]                 ; row -1
    sub     ebx, eax
    movq    xmm0, [ebx]                 ; row -2
    lea     ebx, [ebx + eax*4]          ; row -2 + 4 rows = row +2
    movq    xmm5, [ebx + eax]           ; row +3

    punpcklbw xmm0, xmm7                ; widen every tap to 8 words
    punpcklbw xmm1, xmm7
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7

    ; taps sharing a coefficient are summed first, then scaled once
    paddw   xmm1, xmm4                  ; row -1 + row +2
    paddw   xmm2, xmm3                  ; row  0 + row +1
    paddw   xmm0, xmm5                  ; row -2 + row +3
    pmullw  xmm1, [sse2_5n]             ; * -5  (constant is 16-byte aligned)
    pmullw  xmm2, [sse2_20]             ; * 20
    paddw   xmm0, xmm1
    paddw   xmm0, xmm2
    paddw   xmm0, xmm6                  ; + 16
    psraw   xmm0, 5                     ; signed shift keeps negative sums negative

    packuswb xmm0, xmm0                 ; saturate to 0..255
    movq    [edi + ecx], xmm0

    add     ecx, 8
    cmp     ecx, ebp
    jl      .loopcol                    ; signed <, so a non-multiple-of-8 width still terminates

    add     esi, eax
    add     edi, [esp + 16 + 16]        ; dst_stride (no free register to cache it)
    dec     edx
    jnz     .looprow

.done:
    pop     ebp
    pop     edi
    pop     esi
    pop     ebx

    ret
     
;====================================================== 
; 
; void 
; interpolate_halfpel_hv_sse2(uint8_t* src, int32_t src_stride, uint8_t* dst, int32_t dst_stride, int32_t width, int32_t height) 
; 
; NOTE(review): placeholder only. The body is a single `ret`: no argument is
; read and nothing is written to dst, so the destination block keeps whatever
; it held before the call. Presumably callers must use another implementation
; for the combined horizontal+vertical case - confirm before wiring this
; symbol into a function table.
; 
;====================================================== 
 
align 16 
 
cglobal interpolate_halfpel_hv_sse2 
interpolate_halfpel_hv_sse2 
     
    ret