; www.pudn.com > X264_20060729.rar > mc-a2.asm


;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

; NASM preprocessor directives are introduced by '%'.
; cglobal, SECTION_RODATA, picpush, picgetgot, picpop, picesp and GOT_ebx are
; used in this file but not defined in it; presumably they come from this
; include.
%include "i386inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION_RODATA

; Packed constants used by the filters below.  The "one" names are historical:
; the values are the rounding terms added before the final shifts
; (16 before >> 5, 512 before >> 10).
ALIGN 16
mmx_dw_one:     dw  16,  16,  16,  16      ; 4 words,  rounding for >> 5
mmx_dd_one:     dd 512, 512                ; 2 dwords, rounding for >> 10
mmx_dw_20:      dw  20,  20,  20,  20      ; 4 words,  centre tap weight
mmx_dw_5:       dw  -5,  -5,  -5,  -5      ; 4 words,  inner tap weight (negative)

; Byte offsets, relative to esp, of the locals x264_center_filter_mmxext keeps
; in the stack frame it allocates.
%assign twidth   0      ; i_width
%assign theight  4      ; rows still to process (counted down to zero)
%assign tdstp1   8      ; i_dst1_stride
%assign tdstp2   12     ; i_dst2_stride
%assign tdst1    16     ; current dst1 row pointer
%assign tdst2    20     ; current dst2 row pointer
%assign tsrc     24     ; current source row pointer, biased by -2 * i_src_stride
%assign tsrcp    28     ; i_src_stride
%assign toffset  32     ; total frame size, added back to esp on exit
%assign tbuffer  36     ; first byte of the 16-bit intermediate row buffer


;=============================================================================
; Macros
;=============================================================================

;-----------------------------------------------------------------------------
; LOAD_4 r1, r2, r3, r4, m1, m2, m3, m4, rz
;   Loads 4 bytes from each memory operand m1..m4 (%5..%8) into r1..r4
;   (%1..%4), then interleaves each with rz (%9) so every register holds four
;   16-bit words.  rz must contain zero for the bytes to be zero-extended.
;   Clobbers: %1..%4.
;-----------------------------------------------------------------------------
%macro LOAD_4 9
    movd        %1, %5
    movd        %2, %6
    movd        %3, %7
    movd        %4, %8
    punpcklbw   %1, %9
    punpcklbw   %2, %9
    punpcklbw   %3, %9
    punpcklbw   %4, %9
%endmacro

;-----------------------------------------------------------------------------
; FILT_2 acc, x
;   acc -= 5 * x, done as (acc - x) - 4 * x on packed 16-bit words.
;   Clobbers: x (%2) is left holding 4 * x.
;-----------------------------------------------------------------------------
%macro FILT_2 2
    psubw       %1, %2
    psllw       %2, 2
    psubw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; FILT_4 acc, x, y
;   acc += 20 * (x + y), done as 4 * (x + y) + 16 * (x + y) on packed
;   16-bit words.
;   Clobbers: x (%2) is left holding 16 * (x + y); y (%3) is only read.
;-----------------------------------------------------------------------------
%macro FILT_4 3
    paddw       %2, %3
    psllw       %2, 2
    paddw       %1, %2
    psllw       %2, 2
    paddw       %1, %2
%endmacro

;-----------------------------------------------------------------------------
; FILT_6 acc, x, y, rnd
;   acc = (acc - 5 * x + y + rnd) >> 5   (arithmetic shift, packed 16-bit words)
;   Final step of the 6-tap sum: subtracts the second -5 tap, adds the last
;   +1 tap and the rounding term, then scales.
;   Clobbers: x (%2) is left holding 4 * x; y (%3) and rnd (%4) are only read.
;-----------------------------------------------------------------------------
%macro FILT_6 4
    psubw       %1, %2
    psllw       %2, 2
    psubw       %1, %2
    paddw       %1, %3
    paddw       %1, %4
    psraw       %1, 5
%endmacro

;-----------------------------------------------------------------------------
; FILT_ALL base
;   Vertical 6-tap sum for four adjacent columns.  With r0..r5 being the
;   4-byte groups at base, base + ecx, base + 2*ecx, base + ebx,
;   base + 4*ecx and base + edx:
;       mm1 = r0 - 5*r1 + 20*r2 + 20*r3 - 5*r4 + r5     (four 16-bit words)
;   The result is neither rounded nor shifted.
;
;   Expects: mm0 = 0, ecx = row stride.  For the six loads to be consecutive
;            rows, ebx must be 3 * stride and edx 5 * stride (this is how
;            x264_center_filter_mmxext sets them up).
;   Clobbers: mm1..mm6.
;-----------------------------------------------------------------------------
%macro FILT_ALL 1
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
    FILT_2      mm1, mm2                ; mm1 = r0 - 5*r1
    movd        mm5, [%1 + 4 * ecx]     ; r4
    movd        mm6, [%1 + edx]         ; r5
    FILT_4      mm1, mm3, mm4           ; mm1 += 20*(r2 + r3)
    punpcklbw   mm5, mm0
    punpcklbw   mm6, mm0
    psubw       mm1, mm5                ; mm1 -= 5*r4, as (-r4) then (-4*r4)
    psllw       mm5, 2
    psubw       mm1, mm5
    paddw       mm1, mm6                ; mm1 += r5
%endmacro




;=============================================================================
; Code
;=============================================================================

SECTION .text

; Exported entry points, listed in the order they are defined below.
; cglobal is not defined in this file (presumably provided by i386inc.asm).
cglobal x264_center_filter_mmxext
cglobal x264_horizontal_filter_mmxext

;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
; uint8_t *dst2, int i_dst2_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
; Two-pass 6-tap filter (taps 1, -5, 20, 20, -5, 1), one output row per
; loopcy iteration:
;   pass 1 - vertical: FILT_ALL is applied to four columns at a time over the
;            six rows src - 2*stride .. src + 3*stride.  The unscaled 16-bit
;            sums go to a scratch row on the stack, and (sum + 16) >> 5,
;            saturated to 8 bits, goes to dst1.
;   pass 2 - horizontal (loopcx2): the same taps are applied across the
;            buffered sums, and (sum + 512) >> 10, saturated to 8 bits,
;            goes to dst2.
;
; Arguments are read from the stack.  edi, esi, ebx and ebp are pushed on
; entry and popped before returning; eax, ecx, edx and mm0-mm7 are
; overwritten.  No emms is executed in this routine.
;
; Preconditions implied by the loop exits as written:
;   - loopcx1 starts at eax = 8, steps by 4 and exits only when eax equals
;     i_width, after at least one iteration: i_width must be a multiple of 4
;     and at least 12.
;   - the row counter is decremented and tested with jnz: i_height >= 1.
;   - the frame reserves 2 * i_src_stride + 18 bytes for the scratch row,
;     while pass 2 reads up to 2 * i_width + 14 bytes of it.
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_center_filter_mmxext :

push edi
push esi
push ebx
push ebp

; Four pushes plus the return address: the first argument is at [esp + 20].
mov edx, [esp + 40] ; src_stride
; Frame size = tbuffer bytes of locals + (2 * src_stride + 18) bytes of
; scratch row.
; NOTE(review): 18 + tbuffer = 54, so with an even stride the frame size is
; 2 mod 4 and esp is no longer dword-aligned (assuming it was on entry) -
; confirm this is acceptable.
lea edx, [edx + edx + 18 + tbuffer]
sub esp, edx
mov [esp + toffset] ,edx ; remembered so the epilogue can release the frame

; Copy the arguments into the frame.  [esp + edx + N] is the caller-side
; stack slot that was at [esp + N] before the frame was allocated.
mov eax, [esp + edx + 20] ; dst1
mov [esp + tdst1] ,eax

mov eax, [esp + edx + 28] ; dst2
mov [esp + tdst2] ,eax

mov eax, [esp + edx + 44] ; width
mov [esp + twidth] ,eax

mov eax, [esp + edx + 48] ; height
mov [esp + theight] ,eax

mov eax, [esp + edx + 24] ; dst1_stride
mov [esp + tdstp1] ,eax

mov eax, [esp + edx + 32] ; dst2_stride
mov [esp + tdstp2] ,eax

mov ecx, [esp + edx + 40] ; src_stride
mov [esp + tsrcp] ,ecx ; stored, but not read again in this routine

mov eax, [esp + edx + 36] ; src
sub eax, ecx
sub eax, ecx
mov [esp + tsrc] ,eax ; src - 2 * src_stride

; ecx / ebx / edx = 1x / 3x / 5x src_stride: the row offsets FILT_ALL uses.
lea ebx, [ecx + ecx * 2] ; 3 * src_stride
lea edx, [ecx + ecx * 4] ; 5 * src_stride

; picpush / picgetgot / picpop, picesp and GOT_ebx are not defined in this
; file.  As used here, ebx is borrowed between picpush and picpop for
; constant addressing (GOT_ebx), and frame accesses in that window go through
; picesp instead of esp - presumably to account for the saved ebx in PIC
; builds; confirm against i386inc.asm.
picpush ebx
picgetgot ebx

pxor mm0, mm0 ; 0 ---> mm0 (zero for byte->word unpacking and final packing)


loopcy:

; Per-row setup.  Entered with the picpush still in effect, both on first
; entry and when looping back from the bottom.
mov edi, [picesp + tdst1] ; edi = current dst1 row
lea ebp, [picesp + tbuffer] ; ebp = scratch row of 16-bit sums
mov esi, [picesp + tsrc] ; esi = current source row - 2 * src_stride
movq mm7, [mmx_dw_one GOT_ebx] ; mm7 = 4 x 16; reloaded per row because pass 2 overwrites mm7

picpop ebx ; ebx is 3 * src_stride again, as FILT_ALL needs

; ---- pass 1, columns 0..3 ----
FILT_ALL esi

; Scratch layout: word index = column + 4.  Words 0..3 are left padding.
pshufw mm2, mm1, 0 ; broadcast the column-0 sum
movq [ebp + 8], mm1 ; words 4..7  = columns 0..3
movq [ebp], mm2 ; words 0..3  = left padding
paddw mm1, mm7
psraw mm1, 5

packuswb mm1, mm1
movd [edi], mm1 ; dst1[0..3]

mov eax, 8 ; eax = (first column of the next group) + 4
add esi, 4

; ---- pass 1, middle column groups ----
loopcx1:

FILT_ALL esi

movq [ebp + 2 * eax], mm1 ; unscaled sums for columns eax-4 .. eax-1
paddw mm1, mm7
psraw mm1, 5
packuswb mm1, mm1
movd [edi + eax - 4], mm1

add esi, 4
add eax, 4
cmp eax, [esp + twidth]
jnz loopcx1 ; exits only on exact equality with i_width

; ---- pass 1, last four columns (eax == i_width here) ----
FILT_ALL esi

; Right padding: immediate 7 selects source words 3, 1, 0, 0.
; NOTE(review): the left edge uses a plain broadcast (immediate 0); a
; broadcast of the last column would be immediate 0xFF - confirm that 7 is
; intended.
pshufw mm2, mm1, 7
movq [ebp + 2 * eax], mm1
movq [ebp + 2 * eax + 8], mm2
paddw mm1, mm7
psraw mm1, 5
packuswb mm1, mm1
movd [edi + eax - 4], mm1

; Advance the source and dst1 row pointers for the next loopcy iteration.
mov esi, [esp + tsrc]
add esi, ecx
mov [esp + tsrc], esi

add edi, [esp + tdstp1]
mov [esp + tdst1], edi

mov edi, [esp + tdst2] ; edi = current dst2 row
; mov eax, [esp + twidth]   (left disabled: eax already equals i_width here)
sub eax, 4 ; start pass 2 at the last group of four columns

picpush ebx
picgetgot ebx

; ---- pass 2: horizontal taps over the scratch row, right to left ----
; For output columns eax .. eax+3, tap k (k = 0..5) is the four words at
; scratch byte offset 2 * eax + 4 + 2 * k, i.e. columns eax-2+k onward.
loopcx2:

movq mm2, [picesp + 2 * eax + 2 + 4 + tbuffer] ; tap 1
movq mm3, [picesp + 2 * eax + 4 + 4 + tbuffer] ; tap 2
movq mm4, [picesp + 2 * eax + 6 + 4 + tbuffer] ; tap 3
movq mm5, [picesp + 2 * eax + 8 + 4 + tbuffer] ; tap 4
movq mm1, [picesp + 2 * eax + 4 + tbuffer] ; tap 0
movq mm6, [picesp + 2 * eax + 10 + 4 + tbuffer] ; tap 5
paddw mm2, mm5 ; mm2 = tap1 + tap4  (weight -5)
paddw mm3, mm4 ; mm3 = tap2 + tap3  (weight 20)
paddw mm1, mm6 ; mm1 = tap0 + tap5  (weight  1)

movq mm5, [mmx_dw_20 GOT_ebx]
movq mm4, [mmx_dw_5 GOT_ebx]
movq mm6, mm1
pxor mm7, mm7

; Interleave the sums with their weights so that each pmaddwd dword lane
; yields -5 * (tap1 + tap4) + 20 * (tap2 + tap3) at 32-bit precision:
; mm2 x mm4 covers output columns 0 and 1, mm3 x mm5 covers columns 2 and 3.
punpckhwd mm5, mm2
punpcklwd mm4, mm3
punpcklwd mm2, [mmx_dw_20 GOT_ebx]
punpckhwd mm3, [mmx_dw_5 GOT_ebx]

pcmpgtw mm7, mm1 ; mm7 = per-word sign mask of (tap0 + tap5)

pmaddwd mm2, mm4
pmaddwd mm3, mm5

punpcklwd mm1, mm7 ; sign-extend (tap0 + tap5) to dwords, low pair
punpckhwd mm6, mm7 ; ... and high pair

paddd mm2, mm1
paddd mm3, mm6

paddd mm2, [mmx_dd_one GOT_ebx] ; + 512 rounding
paddd mm3, [mmx_dd_one GOT_ebx]

psrad mm2, 10
psrad mm3, 10

packssdw mm2, mm3 ; four dwords -> four words
packuswb mm2, mm0 ; -> four saturated bytes in the low dword

movd [edi + eax], mm2 ; dst2[eax .. eax+3]

sub eax, 4
jge loopcx2 ; last iteration is eax == 0

add edi, [picesp + tdstp2]
mov [picesp + tdst2], edi

dec dword [picesp + theight]
jnz loopcy

picpop ebx

add esp, [esp + toffset] ; release the frame using the size saved on entry

pop ebp
pop ebx
pop esi
pop edi

ret

;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
; Horizontal 6-tap filter (taps 1, -5, 20, 20, -5, 1).  For each output
; pixel x:
;   dst[x] = clip_u8( ( src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
;                       - 5*src[x+2] + src[x+3] + 16 ) >> 5 )
; Eight output pixels are produced per loophx iteration, as two groups of
; four.  Each iteration reads source bytes src[x-2] .. src[x+10].
;
; Arguments are read from the stack.  edi and esi are pushed on entry and
; popped before returning; eax, ecx and mm0-mm7 are overwritten.  No emms is
; executed in this routine.
;
; Preconditions implied by the loop exits as written:
;   - loophx steps eax by 8 from 0 and exits only when eax equals i_width,
;     after at least one iteration: i_width must be a positive multiple of 8.
;   - the row counter uses dec / jnz: i_height >= 1.
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_horizontal_filter_mmxext :
push edi
push esi

; Two pushes plus the return address: the first argument is at [esp + 12].
mov edi, [esp + 12] ; dst
mov esi, [esp + 20] ; src

pxor mm0, mm0 ; zero, for byte->word unpacking
; picpush / picgetgot / picpop and GOT_ebx are not defined in this file;
; ebx is only borrowed here to address the constant (presumably for PIC
; builds - confirm against i386inc.asm).
picpush ebx
picgetgot ebx
movq mm7, [mmx_dw_one GOT_ebx] ; mm7 = 4 x 16, rounding term for >> 5
picpop ebx

mov ecx, [esp + 32] ; height

sub esi, 2 ; bias so [esi + eax + k] is tap k for output column eax

loophy:

xor eax, eax ; eax = output column

loophx:

prefetchnta [esi + eax + 48]

; First group, outputs eax .. eax+3: taps at byte offsets 0..5.
LOAD_4 mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
FILT_2 mm1, mm2
movd mm5, [esi + eax + 4]
movd mm6, [esi + eax + 5]
FILT_4 mm1, mm3, mm4
; The loads for the second group (taps at offsets 4..9) are interleaved with
; the tail of the first group's arithmetic.
movd mm2, [esi + eax + 4]
movd mm3, [esi + eax + 6]
punpcklbw mm5, mm0
punpcklbw mm6, mm0
FILT_6 mm1, mm5, mm6, mm7 ; mm1 = finished words for outputs eax .. eax+3
movd mm4, [esi + eax + 7]
movd mm5, [esi + eax + 8]
punpcklbw mm2, mm0
punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready
; mm6 still holds the unpacked offset-5 bytes: FILT_6 only modified its
; second operand (mm5).
FILT_2 mm2, mm6
movd mm6, [esi + eax + 9]
punpcklbw mm4, mm0
punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
FILT_4 mm2, mm3, mm4
punpcklbw mm6, mm0
FILT_6 mm2, mm5, mm6, mm7 ; mm2 = finished words for outputs eax+4 .. eax+7

packuswb mm1, mm2 ; eight saturated bytes
movq [edi + eax], mm1

add eax, 8
cmp eax, [esp + 28] ; width
jnz loophx ; exits only on exact equality with i_width

add esi, [esp + 24] ; src_pitch
add edi, [esp + 16] ; dst_pitch

dec ecx
jnz loophy

pop esi
pop edi

ret