; www.pudn.com > X264_20060729.rar > mc-a2.asm
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 64
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
%include "amd64inc.asm"
;=============================================================================
; Read only data
;=============================================================================
SECTION .rodata
ALIGN 16

; Packed constants used by the filters in this file.  Despite the "one" in
; their names, the first two hold rounding biases, not the value 1:
;   mmx_dw_one : 4 x int16 = 16   (added before an arithmetic shift right by 5)
;   mmx_dd_one : 2 x int32 = 512  (added before an arithmetic shift right by 10)
;   mmx_dw_20  : 4 x int16 = 20   (centre tap weight)
;   mmx_dw_5   : 4 x int16 = -5   (note the sign: the stored value is -5)
mmx_dw_one:     dw      16, 16, 16, 16
mmx_dd_one:     dd      512, 512
mmx_dw_20:      dw      20, 20, 20, 20
mmx_dw_5:       dw      -5, -5, -5, -5

; Byte offset from rsp at which the center filter's temporary row buffer starts.
%assign tbuffer 0
;=============================================================================
; Macros
;=============================================================================
; LOAD_4 d0, d1, d2, d3, m0, m1, m2, m3, zero
;   Loads four 32-bit memory operands (m0..m3) into the MMX registers d0..d3
;   and interleaves each one's low four bytes with the low bytes of `zero`.
;   When `zero` holds 0 (as at every call site in this file, which pass mm0
;   after clearing it), each destination ends up with four bytes widened to
;   unsigned 16-bit words.
;   Writes: %1..%4 only.
%macro LOAD_4 9
movd %1, %5 ; four 32-bit loads first ...
movd %2, %6
movd %3, %7
movd %4, %8
punpcklbw %1, %9 ; ... then widen each to 4 x 16-bit
punpcklbw %2, %9
punpcklbw %3, %9
punpcklbw %4, %9
%endmacro
; FILT_2 acc, x
;   acc -= 5 * x   (packed 16-bit words, wrap-around arithmetic)
;   Side effect: x is left multiplied by 4.
%macro FILT_2 2
psubw %1, %2 ; acc -= x
psllw %2, 2 ; x *= 4 (clobbers %2)
psubw %1, %2 ; acc -= 4x, i.e. 5x subtracted in total
%endmacro
; FILT_4 acc, a, b
;   acc += 20 * (a + b)   (packed 16-bit words, wrap-around arithmetic)
;   Side effect: a is left holding 16 * (a + b); b is not modified.
%macro FILT_4 3
paddw %2, %3 ; a = a + b
psllw %2, 2 ; a = 4 * (a + b)
paddw %1, %2 ; acc += 4 * (a + b)
psllw %2, 2 ; a = 16 * (a + b)
paddw %1, %2 ; acc += 16 * (a + b), i.e. 20 * (a + b) added in total
%endmacro
; FILT_6 acc, x, y, rnd
;   Final step of a filter accumulation on packed 16-bit words:
;       acc = (acc - 5 * x + y + rnd) >> 5      (arithmetic shift)
;   The "- 5 * x" part is delegated to FILT_2, so, exactly as with FILT_2,
;   x is left multiplied by 4.  y and rnd are only read.
%macro FILT_6 4
    FILT_2      %1, %2              ; acc -= 5 * x
    paddw       %1, %3              ; acc += y
    paddw       %1, %4              ; acc += rounding term
    psraw       %1, 5               ; scale back down
%endmacro
; FILT_ALL ptr
;   Unscaled 6-tap filter down a column, four adjacent bytes at a time.
;   With r0..r5 being the 4-byte groups at
;       [ptr], [ptr + rcx], [ptr + 2*rcx], [ptr + rbx], [ptr + 4*rcx], [ptr + rdx]
;   the macro leaves, as four 16-bit words,
;       mm1 = r0 - 5*r1 + 20*r2 + 20*r3 - 5*r4 + r5
;   No rounding or shifting is applied here.
;   Expects: mm0 as the zero register for byte->word widening.  The caller in
;            this file sets rcx / rbx / rdx to 1x / 3x / 5x the source stride.
;   Clobbers: mm2, mm3, mm4, mm5, mm6.
%macro FILT_ALL 1
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
    FILT_2      mm1, mm2                    ; mm1 = r0 - 5*r1
    movd        mm5, [%1 + 4 * rcx]         ; r4 (still packed bytes)
    movd        mm6, [%1 + rdx]             ; r5 (still packed bytes)
    FILT_4      mm1, mm3, mm4               ; mm1 += 20 * (r2 + r3)
    punpcklbw   mm5, mm0                    ; widen r4
    punpcklbw   mm6, mm0                    ; widen r5
    FILT_2      mm1, mm5                    ; mm1 -= 5*r4
    paddw       mm1, mm6                    ; mm1 += r5
%endmacro
;=============================================================================
; Code
;=============================================================================
; Everything from here on is executable code.
SECTION .text
; Entry points defined in this file.  `cglobal` is a macro from the included
; amd64inc.asm (not visible here); presumably it declares the symbol global
; with whatever decoration the target platform needs -- confirm there.
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                 uint8_t *src, int i_src_stride,
;                                 int i_width, int i_height );
;
; Two-pass 6-tap (1, -5, 20, 20, -5, 1) filter, one row at a time:
;
;   pass 1 (first group, .loopcx1, last group)
;       Vertical filter over the six source rows src - 2*stride ..
;       src + 3*stride, four pixels per step (FILT_ALL).  The unscaled 16-bit
;       sums v[x] go to a row buffer on the stack, and
;           dst1[x] = sat_u8( (v[x] + 16) >> 5 )
;
;   pass 2 (.loopcx2)
;       The same taps applied horizontally to the 16-bit row buffer, using
;       32-bit accumulation:
;           dst2[x] = sat_u8( (v[x-2] - 5*v[x-1] + 20*v[x] + 20*v[x+1]
;                              - 5*v[x+2] + v[x+3] + 512) >> 10 )
;
; Row buffer layout (16-bit words, starting at rsp + tbuffer):
;       words 0..3              v[0] replicated four times (left padding)
;       word  x + 4             v[x] for 0 <= x < width
;       words width+4..width+7  right padding, built by pshufw imm 7
;
; Preconditions implied by the loop structure (not checked):
;   - i_width is a multiple of 4 and >= 12: the column counters step by 4
;     and the loops exit only on equality with i_width, with .loopcx1
;     entered at column 8.
;   - i_height >= 1 (the row loop tests at the bottom).
;   - src is readable from 2 rows above to 3 rows below every output row.
;   - The row buffer is allocated as 2*i_src_stride + 24 bytes while
;     2*i_width + 16 bytes are written, so i_src_stride >= i_width - 4.
;
; MMX registers are used and no emms is executed here; leaving MMX state is
; the caller's responsibility.
;
; Register roles inside the row loop:
;   r12 = tsrc (top source row of the 6-tap window)   r13 = src_stride
;   r8  = dst1 row      r9  = dst1_stride
;   r10 = dst2 row      r11 = dst2_stride
;   r14 = width         r15 = rows remaining
;   rcx / rbx / rdx = 1x / 3x / 5x src_stride (row offsets used by FILT_ALL)
;   rsi = source column pointer, rdi = dst1 - 4, rax = column counter
;   mm0 = 0
;
;-----------------------------------------------------------------------------
ALIGN 16
x264_center_filter_mmxext :
    ; ---- prologue: save every callee-saved register used below ------------
    push        r15
    pushreg     r15
%ifdef WIN64
    push        rdi
    pushreg     rdi
    push        rsi
    pushreg     rsi
%endif
    push        rbp
    pushreg     rbp
    push        rbx
    pushreg     rbx
    push        r12
    pushreg     r12
    push        r13
    pushreg     r13
    push        r14
    pushreg     r14
    lea         rbp,    [rsp]               ; rbp = frame base (rsp after the pushes)
    setframe    rbp, 0
    endprolog

    ; ---- fetch src / src_stride --------------------------------------------
%ifdef WIN64
    ; 8 pushes = 64 bytes; above them sit the return address and the stack
    ; argument area, so arguments 5 and 6 are at +40 / +48 from there.
    movsxd      r13,    dword [rbp + 64 + 48]   ; src_stride
    mov         r12,    [rbp + 64 + 40]         ; src
%else
    movsxd      r13,    r9d                 ; src_stride
    mov         r12,    r8                  ; src
%endif
    sub         r12,    r13
    sub         r12,    r13                 ; tsrc = src - 2 * src_stride

    ; ---- allocate the 16-bit row buffer on the stack ----------------------
    ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
    lea         rax,    [r13 + r13 + 24 + tbuffer]
    sub         rsp,    rax

    ; ---- move the remaining arguments out of the argument registers -------
    ; (the order matters: each parmN is read before its register is reused)
    mov         r10,    parm3q              ; dst2
    movsxd      r11,    parm4d              ; dst2_stride
    mov         r8,     parm1q              ; dst1
    movsxd      r9,     parm2d              ; dst1_stride
%ifdef WIN64
    movsxd      r14,    dword [rbp + 64 + 56]   ; width
    movsxd      r15,    dword [rbp + 64 + 64]   ; height
%else
    ; 6 pushes = 48 bytes, +8 for the return address
    movsxd      r14,    dword [rbp + 56]    ; width
    movsxd      r15,    dword [rbp + 64]    ; height
%endif

    mov         rcx,    r13                 ; src_stride
    lea         rbx,    [r13 + r13 * 2]     ; 3 * src_stride
    lea         rdx,    [r13 + r13 * 4]     ; 5 * src_stride

    pxor        mm0,    mm0                 ; 0 ---> mm0

.loopcy:
    ; ======== pass 1: vertical filter ======================================
    mov         rsi,    r12                 ; tsrc

    ; -- first group (x = 0..3), also writes the left padding ---------------
    FILT_ALL    rsi
    pshufw      mm2,    mm1, 0              ; replicate v[0] into all four words
    movq        [rsp + tbuffer],      mm2   ; left padding
    movq        [rsp + tbuffer + 8],  mm1   ; v[0..3]
    paddw       mm1,    [mmx_dw_one GLOBAL] ; + 16
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [r8],   mm1                 ; dst1[0] = mm1

    mov         eax,    8                   ; rax = buffer word index of v[4]
    add         rsi,    4
    lea         rdi,    [r8 - 4]            ; rdi = dst1 - 4, so [rdi + rax] is dst1[rax - 4]

    ; -- middle groups --------------------------------------------------------
.loopcx1:
    FILT_ALL    rsi
    movq        [rsp + tbuffer + 2 * rax],  mm1
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax],  mm1           ; dst1[rax - 4] = mm1

    add         rsi,    4
    add         rax,    4
    cmp         rax,    r14                 ; cmp rax, width
    jnz         .loopcx1

    ; -- last group (x = width-4 .. width-1), also writes the right padding -
    FILT_ALL    rsi
    ; imm 7 selects source words (3, 1, 0, 0): only the first padding word is
    ; the last sample.  NOTE(review): a full replicate would be imm 0xFF --
    ; confirm whether the remaining padding words are meant to matter.
    pshufw      mm2,    mm1,  7
    movq        [rsp + tbuffer + 2 * rax],      mm1
    movq        [rsp + tbuffer + 2 * rax + 8],  mm2
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax],  mm1           ; dst1[rax - 4] = mm1

    add         r12,    r13                 ; tsrc = tsrc + src_stride
    add         r8,     r9                  ; dst1 = dst1 + dst1_stride

    ; ======== pass 2: horizontal filter over the row buffer ================
    xor         eax,    eax                 ; x = 0

.loopcx2:
    ; The "+ 4" is a byte offset: it makes the first load start at buffer
    ; word x + 2, i.e. v[x - 2].
    movq        mm2,    [rsp + 2 * rax + 2  + 4 + tbuffer]   ; v[x-1..]
    movq        mm3,    [rsp + 2 * rax + 4  + 4 + tbuffer]   ; v[x  ..]
    movq        mm4,    [rsp + 2 * rax + 6  + 4 + tbuffer]   ; v[x+1..]
    movq        mm5,    [rsp + 2 * rax + 8  + 4 + tbuffer]   ; v[x+2..]
    movq        mm1,    [rsp + 2 * rax      + 4 + tbuffer]   ; v[x-2..]
    movq        mm6,    [rsp + 2 * rax + 10 + 4 + tbuffer]   ; v[x+3..]
    paddw       mm2,    mm5                 ; a = pair weighted by -5
    paddw       mm3,    mm4                 ; b = pair weighted by 20
    paddw       mm1,    mm6                 ; c = pair weighted by 1

    movq        mm5,    [mmx_dw_20 GLOBAL]
    movq        mm4,    [mmx_dw_5 GLOBAL]
    movq        mm6,    mm1
    pxor        mm7,    mm7

    ; Interleave sums with their weights so that pmaddwd produces
    ; -5*a + 20*b as 32-bit values (low two pixels in mm2, high two in mm3).
    punpckhwd   mm5,    mm2                 ; (20, a2, 20, a3)
    punpcklwd   mm4,    mm3                 ; (-5, b0, -5, b1)
    punpcklwd   mm2,    [mmx_dw_20 GLOBAL]  ; (a0, 20, a1, 20)
    punpckhwd   mm3,    [mmx_dw_5 GLOBAL]   ; (b2, -5, b3, -5)

    pcmpgtw     mm7,    mm1                 ; mm7 = sign mask of c (0 > c)

    pmaddwd     mm2,    mm4                 ; -5*a + 20*b, pixels 0..1
    pmaddwd     mm3,    mm5                 ; -5*a + 20*b, pixels 2..3

    punpcklwd   mm1,    mm7                 ; sign-extend c, pixels 0..1
    punpckhwd   mm6,    mm7                 ; sign-extend c, pixels 2..3

    paddd       mm2,    mm1
    paddd       mm3,    mm6

    paddd       mm2,    [mmx_dd_one GLOBAL] ; + 512
    paddd       mm3,    [mmx_dd_one GLOBAL]

    psrad       mm2,    10
    psrad       mm3,    10

    packssdw    mm2,    mm3
    packuswb    mm2,    mm0

    movd        [r10 + rax], mm2            ; dst2[rax] = mm2

    add         rax,    4
    cmp         rax,    r14                 ; cmp rax, width
    jnz         .loopcx2

    add         r10,    r11                 ; dst2 += dst2_stride

    dec         r15                         ; height
    jnz         .loopcy

    ; ---- epilogue: drop the row buffer, restore registers -----------------
    lea         rsp,    [rbp]

    pop         r14
    pop         r13
    pop         r12
    pop         rbx
    pop         rbp
%ifdef WIN64
    pop         rsi
    pop         rdi
%endif
    pop         r15

    ret
;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
; Horizontal 6-tap (1, -5, 20, 20, -5, 1) filter, eight output pixels per
; inner-loop iteration:
;
;   dst[x] = sat_u8( (src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                     - 5*src[x+2] + src[x+3] + 16) >> 5 )
;
; Preconditions implied by the loop structure (not checked):
;   - i_width is a multiple of 8 and >= 8 (the column counter steps by 8 and
;     the loop exits only on equality with i_width).
;   - i_height >= 1 (the row loop tests at the bottom).
;   - Each row is read from src[-2] through src[i_width + 2].
;
; MMX registers are used and no emms is executed here; leaving MMX state is
; the caller's responsibility.
;
; Register roles inside the loops:
;   rdx = src row - 2      r11 = src_stride
;   r9  = dst row          r10 = dst_stride
;   r8  = width            rcx = rows remaining
;   rax = column counter   mm0 = 0   mm7 = rounding term (4 x 16)
;
;-----------------------------------------------------------------------------
ALIGN 16
x264_horizontal_filter_mmxext :
    ; Shuffle the arguments into the register roles listed above.  The order
    ; matters: each parmN is read before the register holding it is reused.
    movsxd      r10,    parm2d              ; dst_stride
    movsxd      r11,    parm4d              ; src_stride
%ifdef WIN64
    mov         rdx,    r8                  ; src
    mov         r9,     rcx                 ; dst
    movsxd      rcx,    parm6d              ; height
%else
    movsxd      rcx,    parm6d              ; height
    mov         r9,     rdi                 ; dst
%endif
    movsxd      r8,     parm5d              ; width

    pxor        mm0,    mm0
    movq        mm7,    [mmx_dw_one GLOBAL] ; rounding term

    sub         rdx,    2                   ; window starts two pixels to the left

.loophy:
    xor         eax,    eax                 ; x = 0

.loophx:
    prefetchnta [rdx + rax + 48]

    ; ---- pixels x .. x+3 -> mm1 -------------------------------------------
    LOAD_4      mm1,    mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
    FILT_2      mm1,    mm2
    movd        mm5,    [rdx + rax + 4]
    movd        mm6,    [rdx + rax + 5]
    FILT_4      mm1,    mm3, mm4
    ; loads for the second group are interleaved with the first group's math
    movd        mm2,    [rdx + rax + 4]
    movd        mm3,    [rdx + rax + 6]
    punpcklbw   mm5,    mm0
    punpcklbw   mm6,    mm0
    FILT_6      mm1,    mm5, mm6, mm7

    ; ---- pixels x+4 .. x+7 -> mm2 -----------------------------------------
    ; mm6 (bytes at +5) is reused as the -5 tap here; FILT_6 only read it.
    movd        mm4,    [rdx + rax + 7]
    movd        mm5,    [rdx + rax + 8]
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0                 ; mm2(1), mm3(20), mm6(-5) ready
    FILT_2      mm2,    mm6
    movd        mm6,    [rdx + rax + 9]
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0                 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
    FILT_4      mm2,    mm3, mm4
    punpcklbw   mm6,    mm0
    FILT_6      mm2,    mm5, mm6, mm7

    packuswb    mm1,    mm2                 ; saturate both groups to 8 bytes
    movq        [r9 + rax],  mm1

    add         rax,    8
    cmp         rax,    r8                  ; cmp rax, width
    jnz         .loophx

    add         rdx,    r11                 ; src_pitch
    add         r9,     r10                 ; dst_pitch
    dec         rcx
    jnz         .loophy

    ret