; Source: www.pudn.com > x264_2007.rar > mc-a2.asm


;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "i386inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION_RODATA

ALIGN 16
; Rounding constants for the half-pel filters below. Each is four identical
; 16-bit words, i.e. one 64-bit MMX operand that is added to a register of
; four word sums just before the final arithmetic right shift.
pw_1:  times 4 dw 1     ; horizontal filter: added before psraw 1
pw_16: times 4 dw 16    ; vertical filter:   added before psraw 5
pw_32: times 4 dw 32    ; center filter:     added before psraw 6

;=============================================================================
; Macros
;=============================================================================

; LOAD_ADD dst, mem_a, mem_b
;   Loads 4 bytes from each memory operand, widens each byte to a 16-bit word
;   and leaves the word-wise sum mem_a + mem_b in dst.
;     %1 = destination MMX register
;     %2 = first memory operand  (32 bits are read)
;     %3 = second memory operand (32 bits are read)
;   Expects mm0 to hold zero: punpcklbw interleaves with mm0 to zero-extend
;   the bytes. Clobbers mm7, so %1 must be neither mm0 nor mm7.
%macro LOAD_ADD 3
    movd        %1,     %2
    movd        mm7,    %3
    punpcklbw   %1,     mm0
    punpcklbw   mm7,    mm0
    paddw       %1,     mm7
%endmacro

; FILT_V
;   Computes a - 5*b + 20*c on packed 16-bit words for two independent
;   register triples; the instructions of the two triples are interleaved.
;     in:  mm1 = a, mm2 = b, mm3 = c      (first triple)
;          mm4 = a, mm5 = b, mm6 = c      (second triple)
;     out: mm1, mm4 = a - 5*b + 20*c      (no rounding, no shift)
;   Destroys the b and c inputs: mm2/mm5 end up holding 4*(b-c) and
;   mm3/mm6 end up holding 16*c.
%macro FILT_V 0
    psubw       mm1,    mm2         ; a-b
    psubw       mm4,    mm5
    psubw       mm2,    mm3         ; b-c
    psubw       mm5,    mm6
    psllw       mm2,    2           ; 4*(b-c)
    psllw       mm5,    2
    psubw       mm1,    mm2         ; a-5*b+4*c
    psubw       mm4,    mm5
    psllw       mm3,    4           ; 16*c
    psllw       mm6,    4
    paddw       mm1,    mm3         ; a-5*b+20*c
    paddw       mm4,    mm6
%endmacro

; FILT_H
;   Computes an approximation of (a - 5*b + 20*c) / 16 on packed 16-bit words
;   for two independent register triples, dividing as it goes instead of
;   building the full sum first.
;     in:  mm1 = a, mm2 = b, mm3 = c      (first triple)
;          mm4 = a, mm5 = b, mm6 = c      (second triple)
;     out: mm1, mm4 = result
;   Unlike FILT_V, the b and c inputs (mm2, mm3, mm5, mm6) are left unchanged.
;   Each psraw discards low bits, so the result is not bit-identical to
;   evaluating the full sum and shifting once.
;   NOTE(review): the staged shifts presumably keep intermediates inside the
;   signed 16-bit range for the larger center-filter inputs - confirm.
%macro FILT_H 0
    psubw       mm1,    mm2         ; a-b
    psubw       mm4,    mm5
    psraw       mm1,    2           ; (a-b)/4
    psraw       mm4,    2
    psubw       mm1,    mm2         ; (a-b)/4-b
    psubw       mm4,    mm5
    paddw       mm1,    mm3         ; (a-b)/4-b+c
    paddw       mm4,    mm6
    psraw       mm1,    2           ; ((a-b)/4-b+c)/4
    psraw       mm4,    2
    paddw       mm1,    mm3         ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    paddw       mm4,    mm6
%endmacro

; FILT_PACK shift
;   Final rounding and narrowing step shared by the center and horizontal
;   filters.
;     %1  = arithmetic right-shift count applied to every word
;     in:  mm1, mm4 = packed 16-bit filter sums, mm7 = rounding constant
;     out: mm1 = eight bytes; words of mm1 in the low half, words of mm4 in
;          the high half, each clamped to 0..255 by packuswb
;   mm4 is modified (rounded and shifted); mm7 is only read.
%macro FILT_PACK 1
    ; first four words: round, then scale down
    paddw       mm1,    mm7
    psraw       mm1,    %1
    ; next four words: same treatment
    paddw       mm4,    mm7
    psraw       mm4,    %1
    ; narrow both halves to unsigned bytes with saturation
    packuswb    mm1,    mm4
%endmacro


;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
;                               int i_stride, int i_width, int i_height );
;
; For every row, writes three filtered versions of the src row, all built
; from the 6-tap kernel (1, -5, 20, 20, -5, 1):
;   dstv : kernel applied down each column (rows -2 .. +3 relative to the
;          current row), then (sum + 16) >> 5
;   dstc : kernel applied across the unrounded 16-bit column sums, using
;          FILT_H (approximate /16) followed by (v + 32) >> 6
;   dsth : kernel applied along the row (columns x-2 .. x+3), using FILT_H
;          followed by (v + 1) >> 1
; Every result is clamped to 0..255 by packuswb and written with movntq.
;
; Arguments are read from the stack relative to ebp (first argument at
; ebp+8). The four pointer arguments are advanced by i_stride and i_height is
; decremented directly in their stack slots. The row loop is bottom-tested,
; so one row is processed even if i_height <= 0.
;
; Memory accessed outside the nominal row:
;   - src is read from two rows above to three rows below the current row,
;     starting at column -2 and running several columns past i_width
;   - dstv is written 8 bytes at a time for x = 0, 8, ... while x <= i_width,
;     so it can be written past i_width
;   - dsth and dstc are written 8 bytes at a time for x = 0, 8, ... < i_width
;
; Stack frame after "sub esp" (assuming picgetgot leaves esp unchanged):
;   [ebp-12, ebp)     saved ebx, esi, edi
;   [ebp-36, ebp-12)  tpw_1, tpw_16, tpw_32: private copies of the constants
;   [esp+8,  ebp-36)  tbuffer: 2*i_stride + 16 bytes of 16-bit column sums
;   [esp,    esp+8)   four words of left padding for tbuffer
; The vertical pass stores up to 2*i_width + 16 bytes into tbuffer, so
; i_width must not exceed i_stride or the stores run into tpw_1.
;
; NOTE(review): this routine returns without executing emms (which
; x264_plane_copy_mmxext in this file does) and without an sfence after its
; movntq stores; presumably the caller takes care of both - confirm.
; NOTE(review): picgetgot and GOT_ebx come from i386inc.asm; presumably they
; set up ebx for position-independent access to the constants - confirm.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_mmxext 
    push        ebp
    mov         ebp,    esp
    push        ebx
    push        esi
    push        edi
    picgetgot   ebx

    ; stack locations (used as memory operands)
    %define     tdsth   ebp +  8
    %define     tdstv   ebp + 12
    %define     tdstc   ebp + 16
    %define     tsrc    ebp + 20
    %define     tstride ebp + 24
    %define     twidth  ebp + 28
    %define     theight ebp + 32
    %define     tpw_1   ebp - 36
    %define     tpw_16  ebp - 28
    %define     tpw_32  ebp - 20
    %define     tbuffer esp +  8

    ; register roles; dsth/dstv/dstc share ebx and are reloaded per pass
    %define     x       eax
    %define     dsth    ebx
    %define     dstv    ebx
    %define     dstc    ebx
    %define     src     ecx
    %define     src3    edx
    %define     stride  esi
    %define     width   edi

    mov         stride, [tstride]
    mov         width,  [twidth]
    lea         eax,    [stride*2 + 24 + 24]    ; row buffer + padding + 3 constants
    sub         esp,    eax
    pxor        mm0,    mm0                     ; mm0 = 0 for byte->word unpacking

    ; mov globals onto the stack, to free up ebx
    movq        mm1,    [pw_1  GOT_ebx]
    movq        mm2,    [pw_16 GOT_ebx]
    movq        mm3,    [pw_32 GOT_ebx]
    movq        [tpw_1],  mm1
    movq        [tpw_16], mm2
    movq        [tpw_32], mm3

.loopy:

    mov         src,    [tsrc]
    mov         dstv,   [tdstv]
    lea         src3,   [src + stride]          ; src3 = row +1
    sub         src,    stride
    sub         src,    stride                  ; src  = row -2
    xor         x,      x
ALIGN 16
.vertical_filter:

    prefetcht0  [src3 + stride*2 + 32]

    ; a = rows -2 and +3, b = rows -1 and +2, c = rows 0 and +1
    ; suffix 0 = columns x..x+3, suffix 1 = columns x+4..x+7
    LOAD_ADD    mm1,    [src               ], [src3 + stride*2    ] ; a0
    LOAD_ADD    mm2,    [src + stride      ], [src3 + stride      ] ; b0
    LOAD_ADD    mm3,    [src + stride*2    ], [src3               ] ; c0
    LOAD_ADD    mm4,    [src            + 4], [src3 + stride*2 + 4] ; a1
    LOAD_ADD    mm5,    [src + stride   + 4], [src3 + stride   + 4] ; b1
    LOAD_ADD    mm6,    [src + stride*2 + 4], [src3            + 4] ; c1

    FILT_V

    movq        mm7,    [tpw_16]
    ; keep the unrounded 16-bit sums for the center filter
    movq        [tbuffer + x*2],  mm1
    movq        [tbuffer + x*2 + 8],  mm4
    paddw       mm1,    mm7
    paddw       mm4,    mm7
    psraw       mm1,    5
    psraw       mm4,    5
    packuswb    mm1,    mm4
    movntq      [dstv + x], mm1

    add         x,      8
    add         src,    8
    add         src3,   8
    cmp         x,      width
    ; jle, not jl: also runs for x == width, so tbuffer holds sums to the
    ; right of the last column for the center filter to read
    jle         .vertical_filter

    pshufw      mm2, [tbuffer], 0  ; broadcast the column-0 sum to 4 words
    movq        [tbuffer - 8], mm2 ; pad left
    ; no need to pad right, since vertical_filter already did 4 extra pixels

    mov         dstc,   [tdstc]
    xor         x,      x
    movq        mm7,    [tpw_32]   ; rounding constant consumed by FILT_PACK
.center_filter:

    ; tbuffer holds words, so a byte offset of 2 is one column
    movq        mm1,    [tbuffer + x*2 - 4 ]
    movq        mm2,    [tbuffer + x*2 - 2 ]
    movq        mm3,    [tbuffer + x*2     ]
    movq        mm4,    [tbuffer + x*2 + 4 ]
    movq        mm5,    [tbuffer + x*2 + 6 ]
    paddw       mm3,    [tbuffer + x*2 + 2 ] ; c0
    paddw       mm2,    mm4                  ; b0
    paddw       mm1,    mm5                  ; a0
    movq        mm6,    [tbuffer + x*2 + 8 ]
    paddw       mm4,    [tbuffer + x*2 + 14] ; a1
    paddw       mm5,    [tbuffer + x*2 + 12] ; b1
    paddw       mm6,    [tbuffer + x*2 + 10] ; c1

    FILT_H
    FILT_PACK 6
    movntq      [dstc + x], mm1

    add         x,      8
    cmp         x,      width
    jl          .center_filter

    mov         dsth,   [tdsth]
    mov         src,    [tsrc]
    xor         x,      x
.horizontal_filter:

    movd        mm1,    [src + x - 2]
    movd        mm2,    [src + x - 1]
    movd        mm3,    [src + x    ]
    movd        mm6,    [src + x + 1]
    movd        mm4,    [src + x + 2]
    movd        mm5,    [src + x + 3]
    punpcklbw   mm1,    mm0
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0
    punpcklbw   mm6,    mm0
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0
    paddw       mm3,    mm6 ; c0
    paddw       mm2,    mm4 ; b0
    paddw       mm1,    mm5 ; a0
    movd        mm7,    [src + x + 7]
    movd        mm6,    [src + x + 6]
    punpcklbw   mm7,    mm0
    punpcklbw   mm6,    mm0
    paddw       mm4,    mm7 ; a1 = [x+2] + [x+7]
    paddw       mm5,    mm6 ; b1 = [x+3] + [x+6]
    movd        mm7,    [src + x + 5]
    movd        mm6,    [src + x + 4]
    punpcklbw   mm7,    mm0
    punpcklbw   mm6,    mm0
    paddw       mm6,    mm7 ; c1 = [x+4] + [x+5]

    movq        mm7,    [tpw_1]    ; mm7 was used as scratch above; reload rounding
    FILT_H
    FILT_PACK 1
    movntq      [dsth + x], mm1

    add         x,      8
    cmp         x,      width
    jl          .horizontal_filter

    ; advance all four pointers one row, in their argument slots
    add         [tsrc],  stride
    add         [tdsth], stride
    add         [tdstv], stride
    add         [tdstc], stride
    dec         dword [theight]
    jg          .loopy

    lea         esp,    [ebp-12]   ; drop the variable-size frame, back to saved regs
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret




;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;
; Copies h rows from src (row pitch i_src bytes) to dst (row pitch i_dst
; bytes). Arguments are read from the stack; h is counted down in its own
; argument slot.
;
; Each row is copied in 64-byte chunks with non-temporal movntq stores, and
; the remainder (0..60 bytes) with rep movsd.
;
; Notes for callers:
;   - w is rounded up to a multiple of 4, so up to 3 bytes past w are read
;     from src and written to dst on every row.
;   - The row loop is bottom-tested: one row is copied even if h <= 0, so
;     h must be at least 1.
;   - rep movsd assumes the direction flag is clear on entry, as the C
;     calling convention normally guarantees.
;   - An sfence is issued before returning so that the weakly-ordered
;     movntq stores are globally visible before any store the caller
;     performs afterwards (e.g. when handing the plane to another thread).
;   - emms is executed before returning; ebx, esi and edi are preserved.
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext
    push   edi
    push   esi
    push   ebx
    mov    edi, [esp+16] ; dst
    mov    ebx, [esp+20] ; i_dst
    mov    esi, [esp+24] ; src
    mov    eax, [esp+28] ; i_src
    mov    edx, [esp+32] ; w
    add    edx, 3
    and    edx, ~3       ; edx = w rounded up to a multiple of 4
    sub    ebx, edx      ; ebx = bytes from end of one dst row to the next
    sub    eax, edx      ; eax = bytes from end of one src row to the next
.loopy:
    mov    ecx, edx
    sub    ecx, 64
    jl     .endx         ; fewer than 64 bytes in the row: skip the bulk loop
.loopx:
    prefetchnta [esi+256]
    movq   mm0, [esi   ]
    movq   mm1, [esi+ 8]
    movq   mm2, [esi+16]
    movq   mm3, [esi+24]
    movq   mm4, [esi+32]
    movq   mm5, [esi+40]
    movq   mm6, [esi+48]
    movq   mm7, [esi+56]
    movntq [edi   ], mm0
    movntq [edi+ 8], mm1
    movntq [edi+16], mm2
    movntq [edi+24], mm3
    movntq [edi+32], mm4
    movntq [edi+40], mm5
    movntq [edi+48], mm6
    movntq [edi+56], mm7
    add    esi, 64
    add    edi, 64
    sub    ecx, 64
    jge    .loopx
.endx:
    prefetchnta [esi+256]
    add    ecx, 64       ; ecx = bytes left in this row (multiple of 4, < 64)
    shr    ecx, 2        ; ... as a dword count
    rep movsd            ; copies the tail and advances esi/edi past it
    add    edi, ebx
    add    esi, eax
    sub    dword [esp+36], 1 ; h
    jg     .loopy
    sfence               ; order the non-temporal stores before returning
    pop    ebx
    pop    esi
    pop    edi
    emms
    ret