; Source listing: www.pudn.com > x264_2007.rar > mc-a2.asm


;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata align=16

; Packed-word rounding constants: four 16-bit lanes each, i.e. one 8-byte
; MMX operand.  They are loaded into mm7 by x264_hpel_filter_mmxext before the
; final shift:
;   pw_16 -> vertical output   (>> 5)
;   pw_32 -> centre output     (>> 6)
;   pw_1  -> horizontal output (>> 1)
ALIGN 16
pw_1:  dw  1,  1,  1,  1
pw_16: dw 16, 16, 16, 16
pw_32: dw 32, 32, 32, 32

;=============================================================================
; Macros
;=============================================================================

; LOAD_ADD dst, srcA, srcB
;   dst = widen16(4 bytes at srcA) + widen16(4 bytes at srcB)
;   %1  MMX destination register (must not be mm0 or mm7)
;   %2  first  32-bit memory operand
;   %3  second 32-bit memory operand
; Requires mm0 == 0 (used as the zero half of the byte->word unpack).
; Clobbers mm7.
%macro LOAD_ADD 3
    movd        %1,     %2          ; first four pixels
    punpcklbw   %1,     mm0         ;   -> four words
    movd        mm7,    %3          ; second four pixels
    punpcklbw   mm7,    mm0         ;   -> four words
    paddw       %1,     mm7         ; lane-wise sum
%endmacro

; FILT_V_LANE a, b, c
;   One lane of the 6-tap kernel on packed words, no rounding or scaling.
;   On exit %1 = a - 5*b + 20*c.  %2 and %3 are destroyed.
%macro FILT_V_LANE 3
    psubw       %1,     %2          ; a-b
    psubw       %2,     %3          ; b-c
    psllw       %2,     2           ; 4*(b-c)
    psubw       %1,     %2          ; a-5*b+4*c
    psllw       %3,     4           ; 16*c
    paddw       %1,     %3          ; a-5*b+20*c
%endmacro

; FILT_V
;   Applies the kernel to both halves of an 8-pixel group:
;     mm1 = f(mm1, mm2, mm3)      mm4 = f(mm4, mm5, mm6)
;   mm2, mm3, mm5 and mm6 are destroyed.
%macro FILT_V 0
    FILT_V_LANE mm1,    mm2,    mm3
    FILT_V_LANE mm4,    mm5,    mm6
%endmacro

; FILT_H_LANE a, b, c
;   One lane of the 6-tap kernel, pre-scaled by 1/16 using two arithmetic
;   right shifts so that intermediates stay within 16-bit words:
;     %1 = ((a-b)/4 - b + c)/4 + c   ~=  (a - 5*b + 20*c) / 16
;   %2 and %3 are read only.
%macro FILT_H_LANE 3
    psubw       %1,     %2          ; a-b
    psraw       %1,     2           ; (a-b)/4
    psubw       %1,     %2          ; (a-b)/4-b
    paddw       %1,     %3          ; (a-b)/4-b+c
    psraw       %1,     2           ; ((a-b)/4-b+c)/4
    paddw       %1,     %3          ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro

; FILT_H
;   Applies the scaled kernel to both halves of an 8-pixel group:
;     mm1 = f(mm1, mm2, mm3)      mm4 = f(mm4, mm5, mm6)
%macro FILT_H 0
    FILT_H_LANE mm1,    mm2,    mm3
    FILT_H_LANE mm4,    mm5,    mm6
%endmacro

; FILT_PACK shift
;   Rounds, scales and packs the two filtered halves into 8 output pixels.
;   %1   right-shift count
;   In:  mm1 = low 4 results, mm4 = high 4 results, mm7 = rounding term
;   Out: mm1 = 8 bytes, unsigned-saturated.  mm4 is modified.
%macro FILT_PACK 1
    paddw       mm1,    mm7         ; low half:  add rounding term
    psraw       mm1,    %1          ;            scale down
    paddw       mm4,    mm7         ; high half: add rounding term
    psraw       mm4,    %1          ;            scale down
    packuswb    mm1,    mm4         ; clamp to 0..255 and merge
%endmacro


;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
;                               int i_stride, int i_width, int i_height );
;
; For every row, 8 pixels per step, applies the 6-tap kernel (a - 5*b + 20*c,
; where a/b/c are the sums of the outer/middle/inner tap pairs):
;   dstv : kernel across rows -2..+3 of src,            (v + 16) >> 5
;   dstc : kernel across columns of the unrounded vertical results kept in
;          tbuffer,                                     (v/16 + 32) >> 6
;   dsth : kernel across columns -2..+3 of src,         (v/16 + 1) >> 1
;
; Register roles after the prologue:
;   rdi=dsth  rsi=dstv  rdx=dstc  rcx=src  r8=stride  r9=width  ebx=height
;   r10=3*stride  r11=5*stride  rax=x (column index)  mm0=0
; Stack: 2*stride+24 bytes of scratch below the saved registers (tbuffer).
;
; Memory touched beyond the nominal plane: the vertical pass iterates while
; x <= width, and the horizontal pass reads src[x-2 .. x+10], so the planes
; must be padded accordingly.
;
; NOTE(review): the routine returns without emms or sfence although it uses
; MMX registers and movntq; presumably the caller takes care of both - confirm.
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_mmxext

%ifdef WIN64
    push        rdi
    pushreg     rdi
    push        rsi
    pushreg     rsi
%endif
    push        rbp
    pushreg     rbp
    push        rbx
    pushreg     rbx
    mov         rbp,    rsp
    setframe    rbp, 0
    endprolog

%ifdef WIN64
    ; Move the register arguments into the layout used below, then fetch
    ; arguments 5..7 from the stack: 4 pushes (32) + return address (8) +
    ; 32 bytes of shadow space = 72.
    mov         rdi,    parm1q
    mov         rsi,    parm2q
    mov         rdx,    parm3q
    mov         rcx,    parm4q
    movsxd      r8,     dword [rbp+72]
    movsxd      r9,     dword [rbp+80]
    mov         ebx,    dword [rbp+88]
%else
    ; i_stride and i_width arrive as 32-bit ints in r8d/r9d; the upper halves
    ; of r8/r9 are not defined by the ABI, so widen them before they are used
    ; in 64-bit address arithmetic and compares.
    movsxd      r8,     r8d
    movsxd      r9,     r9d
    ; 7th argument: 2 pushes (16) + return address (8) = 24.
    mov         ebx,    dword [rbp+24]
%endif
    %define     dsth    rdi
    %define     dstv    rsi
    %define     dstc    rdx
    %define     src     rcx
    %define     stride  r8
    %define     width   r9
    %define     height  ebx
    %define     stride3 r10
    %define     stride5 r11
    %define     x       rax
    %define     tbuffer rsp + 8

    lea         stride3, [stride*3]
    lea         stride5, [stride*5]
    sub         src,    stride      ; src -= 2*stride: top row of the
    sub         src,    stride      ; 6-row vertical window

    ; Scratch row of 16-bit intermediates: 8 bytes of left padding at [rsp],
    ; then tbuffer.
    lea         rax,    [stride*2 + 24]
    sub         rsp,    rax

    pxor        mm0,    mm0         ; zero register for byte->word unpacking

.loopy:

    xor         x,      x
ALIGN 16
.vertical_filter:

    prefetcht0  [src + stride5 + 32]

    LOAD_ADD    mm1,    [src               ], [src + stride5     ] ; a0
    LOAD_ADD    mm2,    [src + stride      ], [src + stride*4    ] ; b0
    LOAD_ADD    mm3,    [src + stride*2    ], [src + stride3     ] ; c0
    LOAD_ADD    mm4,    [src            + 4], [src + stride5  + 4] ; a1
    LOAD_ADD    mm5,    [src + stride   + 4], [src + stride*4 + 4] ; b1
    LOAD_ADD    mm6,    [src + stride*2 + 4], [src + stride3  + 4] ; c1

    FILT_V

    movq        mm7,    [pw_16 GLOBAL]
    movq        [tbuffer + x*2],  mm1       ; keep unrounded words for
    movq        [tbuffer + x*2 + 8],  mm4   ; the centre filter
    paddw       mm1,    mm7
    paddw       mm4,    mm7
    psraw       mm1,    5
    psraw       mm4,    5
    packuswb    mm1,    mm4
    movntq      [dstv + x], mm1

    add         x,      8
    add         src,    8
    cmp         x,      width
    jle         .vertical_filter    ; inclusive: also fills the right margin

    pshufw      mm2, [tbuffer], 0   ; broadcast the first intermediate word
    movq        [tbuffer - 8], mm2 ; pad left
    ; no need to pad right, since vertical_filter already did 4 extra pixels

    sub         src,    x           ; rewind src to the start of the row
    xor         x,      x
    movq        mm7,    [pw_32 GLOBAL]
.center_filter:

    ; Offsets are in bytes; each intermediate is one 16-bit word.
    movq        mm1,    [tbuffer + x*2 - 4 ]
    movq        mm2,    [tbuffer + x*2 - 2 ]
    movq        mm3,    [tbuffer + x*2     ]
    movq        mm4,    [tbuffer + x*2 + 4 ]
    movq        mm5,    [tbuffer + x*2 + 6 ]
    paddw       mm3,    [tbuffer + x*2 + 2 ] ; c0
    paddw       mm2,    mm4                  ; b0
    paddw       mm1,    mm5                  ; a0
    movq        mm6,    [tbuffer + x*2 + 8 ]
    paddw       mm4,    [tbuffer + x*2 + 14] ; a1
    paddw       mm5,    [tbuffer + x*2 + 12] ; b1
    paddw       mm6,    [tbuffer + x*2 + 10] ; c1

    FILT_H
    FILT_PACK 6
    movntq      [dstc + x], mm1

    add         x,      8
    cmp         x,      width
    jl          .center_filter

    lea         src,    [src + stride*2]    ; src now points at the current row
    xor         x,      x
.horizontal_filter:

    movd        mm1,    [src + x - 2]
    movd        mm2,    [src + x - 1]
    movd        mm3,    [src + x    ]
    movd        mm6,    [src + x + 1]
    movd        mm4,    [src + x + 2]
    movd        mm5,    [src + x + 3]
    punpcklbw   mm1,    mm0
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0
    punpcklbw   mm6,    mm0
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0
    paddw       mm3,    mm6 ; c0
    paddw       mm2,    mm4 ; b0
    paddw       mm1,    mm5 ; a0
    movd        mm7,    [src + x + 7]
    movd        mm6,    [src + x + 6]
    punpcklbw   mm7,    mm0
    punpcklbw   mm6,    mm0
    paddw       mm4,    mm7 ; a1 (outer pair: x+2, x+7)
    paddw       mm5,    mm6 ; b1 (middle pair: x+3, x+6)
    movd        mm7,    [src + x + 5]
    movd        mm6,    [src + x + 4]
    punpcklbw   mm7,    mm0
    punpcklbw   mm6,    mm0
    paddw       mm6,    mm7 ; c1 (inner pair: x+4, x+5)

    movq        mm7,    [pw_1 GLOBAL]
    FILT_H
    FILT_PACK 1
    movntq      [dsth + x], mm1

    add         x,      8
    cmp         x,      width
    jl          .horizontal_filter

    sub         src,    stride      ; net +1 row, back at the window top
    add         dsth,   stride
    add         dstv,   stride
    add         dstc,   stride
    dec         height
    jg          .loopy

    mov         rsp,    rbp         ; release tbuffer
    pop         rbx
    pop         rbp
%ifdef WIN64
    pop         rsi
    pop         rdi
%endif
    ret



;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;
; Copies h rows from src (row pitch i_src) to dst (row pitch i_dst).
; w is rounded up to a multiple of 4, so up to 3 bytes past the requested
; width are read and written per row.  Each row is copied in 64-byte chunks
; with non-temporal stores, and the remainder with rep movsd.
; At least one row is always copied, because h is only tested after the row.
;
; Register roles inside the loops:
;   rdi = dst cursor        rsi = src cursor      ecx = bytes left in row
;   rdx = i_dst - w         rax = i_src - w       (pitch minus bytes copied)
;
; NOTE(review): the code relies on dst already being in rdi and on
; parm2q/parm3q being rsi/rdx; there is no WIN64-specific path - confirm this
; routine is only built for the non-WIN64 ABI.
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext
    movsxd parm2q, parm2d
    movsxd parm4q, parm4d
    add    parm5d, 3
    and    parm5d, ~3               ; w = (w + 3) & ~3
    sub    parm2q, parm5q           ; dst gap between end of row and next row
    sub    parm4q, parm5q           ; src gap between end of row and next row
    ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
    xchg   rsi, rdx
    mov    rax, parm4q
.loopy:
    mov    ecx, parm5d
    sub    ecx, 64
    jl     .endx                    ; fewer than 64 bytes: tail copy only
.loopx:
    prefetchnta [rsi+256]
    movq   mm0, [rsi   ]
    movq   mm1, [rsi+ 8]
    movq   mm2, [rsi+16]
    movq   mm3, [rsi+24]
    movq   mm4, [rsi+32]
    movq   mm5, [rsi+40]
    movq   mm6, [rsi+48]
    movq   mm7, [rsi+56]
    movntq [rdi   ], mm0
    movntq [rdi+ 8], mm1
    movntq [rdi+16], mm2
    movntq [rdi+24], mm3
    movntq [rdi+32], mm4
    movntq [rdi+40], mm5
    movntq [rdi+48], mm6
    movntq [rdi+56], mm7
    add    rsi, 64
    add    rdi, 64
    sub    ecx, 64
    jge    .loopx
.endx:
    prefetchnta [rsi+256]
    add    ecx, 64                  ; undo the bias: 0..60 bytes remain
    shr    ecx, 2                   ; -> dword count (also zero-extends rcx)
    rep movsd
    add    rdi, rdx                 ; advance both cursors to the next row
    add    rsi, rax
    sub    parm6d, 1
    jg     .loopy
    ; movntq stores are weakly ordered; fence them so the copied plane is
    ; globally visible before any store the caller issues after we return.
    sfence
    emms
    ret