; www.pudn.com > x264_2007.rar > mc-a.asm


;*****************************************************************************
;* mc.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
;*
;* Authors: Min Chen  (converted to nasm)
;*          Laurent Aimar  (init algorithm)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

;*****************************************************************************
;*                                                                           *
;*  Revision history:                                                        *
;*                                                                           *
;*  2004.05.17 portab mc_copy_w4/8/16 (CM)                                   *
;*                                                                           *
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "i386inc.asm"

;=============================================================================
; Constants
;=============================================================================

; Read-only data.  SECTION_RODATA is not defined in this file; presumably it is
; a section-selection macro supplied by i386inc.asm.
SECTION_RODATA

; Packed-word constants: every label below holds four identical 16-bit words
; (8 bytes), so each one can be loaded straight into an MMX register.
ALIGN 16
pw_4:  times 4 dw  4    ; 4 in each word
pw_8:  times 4 dw  8    ; 8 in each word  (chroma MC: basis for 8-dx / 8-dy)
pw_32: times 4 dw 32    ; 32 in each word (rounding term before a >>6)
pw_64: times 4 dw 64    ; 64 in each word (biweight: weight_src = 64 - weight_dst)

;=============================================================================
; Code
;=============================================================================

SECTION .text

;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst,  int i_dst_stride,
;                                uint8_t *src1, int i_src1_stride,
;                                uint8_t *src2, int i_src2_stride,
;                                int i_height );
;
; Rounded byte average of two 4-pixel-wide blocks:
;     dst[y][x] = ( src1[y][x] + src2[y][x] + 1 ) >> 1,   x = 0..3
; Two rows are handled per iteration, so the row counter drops by 2 each time.
;
; Clobbers: eax, ecx, edx, mm0-mm3, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w4_mmxext
    push        ebp
    push        ebx
    push        esi
    push        edi

    ; four pushes + return address => first argument lives at [esp+20]
    mov         edi, [esp+20]       ; dst
    mov         ebx, [esp+28]       ; src1
    mov         ecx, [esp+36]       ; src2
    mov         esi, [esp+24]       ; i_dst_stride
    mov         eax, [esp+32]       ; i_src1_stride
    mov         edx, [esp+40]       ; i_src2_stride
    mov         ebp, [esp+44]       ; i_height
ALIGN 4
.height_loop:
    movd        mm0, [ebx]          ; src1, row 0
    movd        mm2, [ecx]          ; src2, row 0
    movd        mm1, [ebx+eax]      ; src1, row 1
    movd        mm3, [ecx+edx]      ; src2, row 1
    ; src2 is fetched with movd rather than used as a pavgb memory operand:
    ; the memory form of pavgb reads 8 bytes, i.e. 4 bytes beyond the
    ; 4-pixel row that this function actually needs.
    pavgb       mm0, mm2
    pavgb       mm1, mm3
    movd        [edi], mm0
    movd        [edi+esi], mm1
    dec         ebp
    dec         ebp
    ; lea leaves the flags alone, so jg below still tests the second dec
    lea         ebx, [ebx+eax*2]
    lea         ecx, [ecx+edx*2]
    lea         edi, [edi+esi*2]
    jg          .height_loop

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

                          

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w8_mmxext( uint8_t *dst,  int i_dst_stride,
;                                uint8_t *src1, int i_src1_stride,
;                                uint8_t *src2, int i_src2_stride,
;                                int i_height );
;
; Rounded byte average of two 8-pixel-wide blocks, one row per iteration:
;     dst[y][x] = ( src1[y][x] + src2[y][x] + 1 ) >> 1,   x = 0..7
;
; Clobbers: eax, ecx, edx, mm0, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w8_mmxext
    push        ebp
    push        ebx
    push        esi
    push        edi

    ; four pushes + return address => first argument lives at [esp+20]
    mov         ebp, [esp+44]       ; rows remaining (i_height)
    mov         edx, [esp+40]       ; i_src2_stride
    mov         ecx, [esp+36]       ; src2
    mov         eax, [esp+32]       ; i_src1_stride
    mov         ebx, [esp+28]       ; src1
    mov         esi, [esp+24]       ; i_dst_stride
    mov         edi, [esp+20]       ; dst
ALIGN 4
.next_row:
    movq        mm0, [ebx]          ; 8 pixels of src1
    pavgb       mm0, [ecx]          ; averaged with 8 pixels of src2
    movq        [edi], mm0
    add         ebx, eax            ; step all three pointers one row down
    add         ecx, edx
    add         edi, esi
    dec         ebp                 ; last flag writer before the branch
    jg          .next_row

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret



;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_mmxext( uint8_t *dst,  int i_dst_stride,
;                                 uint8_t *src1, int i_src1_stride,
;                                 uint8_t *src2, int i_src2_stride,
;                                 int i_height );
;
; Rounded byte average of two 16-pixel-wide blocks, one row per iteration,
; done as two 8-byte MMX halves:
;     dst[y][x] = ( src1[y][x] + src2[y][x] + 1 ) >> 1,   x = 0..15
;
; Clobbers: eax, ecx, edx, mm0, mm1, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w16_mmxext
    push        ebp
    push        ebx
    push        esi
    push        edi

    ; four pushes + return address => first argument lives at [esp+20]
    mov         ebp, [esp+44]       ; rows remaining (i_height)
    mov         edx, [esp+40]       ; i_src2_stride
    mov         ecx, [esp+36]       ; src2
    mov         eax, [esp+32]       ; i_src1_stride
    mov         ebx, [esp+28]       ; src1
    mov         esi, [esp+24]       ; i_dst_stride
    mov         edi, [esp+20]       ; dst
ALIGN 4
.next_row:
    movq        mm0, [ebx]          ; src1 pixels 0..7
    movq        mm1, [ebx+8]        ; src1 pixels 8..15
    pavgb       mm0, [ecx]          ; average with src2 pixels 0..7
    pavgb       mm1, [ecx+8]        ; average with src2 pixels 8..15
    movq        [edi], mm0
    movq        [edi+8], mm1
    add         ebx, eax            ; step all three pointers one row down
    add         ecx, edx
    add         edi, esi
    dec         ebp                 ; last flag writer before the branch
    jg          .next_row

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret



;-----------------------------------------------------------------------------
; void x264_pixel_avg_w20_mmxext( uint8_t *dst,  int i_dst_stride,
;                                 uint8_t *src1, int i_src1_stride,
;                                 uint8_t *src2, int i_src2_stride,
;                                 int i_height );
;
; Rounded byte average of two 20-pixel-wide blocks, one row per iteration,
; done as 8 + 8 + 4 bytes:
;     dst[y][x] = ( src1[y][x] + src2[y][x] + 1 ) >> 1,   x = 0..19
;
; Clobbers: eax, ecx, edx, mm0-mm3, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w20_mmxext
    push        ebp
    push        ebx
    push        esi
    push        edi

    ; four pushes + return address => first argument lives at [esp+20]
    mov         edi, [esp+20]       ; dst
    mov         ebx, [esp+28]       ; src1
    mov         ecx, [esp+36]       ; src2
    mov         esi, [esp+24]       ; i_dst_stride
    mov         eax, [esp+32]       ; i_src1_stride
    mov         edx, [esp+40]       ; i_src2_stride
    mov         ebp, [esp+44]       ; i_height
ALIGN 4
.height_loop:
    movq        mm0, [ebx   ]       ; src1 pixels  0..7
    movq        mm1, [ebx+8 ]       ; src1 pixels  8..15
    movd        mm2, [ebx+16]       ; src1 pixels 16..19
    ; The last 4 src2 pixels are fetched with movd: a pavgb memory operand
    ; is 8 bytes wide and would read src2 bytes 20..23, past the 20-pixel row.
    movd        mm3, [ecx+16]       ; src2 pixels 16..19
    pavgb       mm0, [ecx   ]
    pavgb       mm1, [ecx+8 ]
    pavgb       mm2, mm3
    movq        [edi   ], mm0
    movq        [edi+8 ], mm1
    movd        [edi+16], mm2
    dec         ebp
    ; lea leaves the flags alone, so jg below still tests the dec
    lea         ebx, [ebx+eax]
    lea         ecx, [ecx+edx]
    lea         edi, [edi+esi]
    jg          .height_loop

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret



ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_sse2( uint8_t *dst,  int i_dst_stride,
;                               uint8_t *src1, int i_src1_stride,
;                               uint8_t *src2, int i_src2_stride,
;                               int i_height );
;
; Rounded byte average of two 16-pixel-wide blocks, one row per iteration:
;     dst[y][x] = ( src1[y][x] + src2[y][x] + 1 ) >> 1,   x = 0..15
;
; None of dst, src1 or src2 needs to be 16-byte aligned: all three are
; accessed with movdqu.
;
; Clobbers: eax, ecx, edx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_w16_sse2
    push        ebp
    push        ebx
    push        esi
    push        edi

    ; four pushes + return address => first argument lives at [esp+20]
    mov         edi, [esp+20]       ; dst
    mov         ebx, [esp+28]       ; src1
    mov         ecx, [esp+36]       ; src2
    mov         esi, [esp+24]       ; i_dst_stride
    mov         eax, [esp+32]       ; i_src1_stride
    mov         edx, [esp+40]       ; i_src2_stride
    mov         ebp, [esp+44]       ; i_height
ALIGN 4
.height_loop:
    movdqu      xmm0, [ebx]
    ; src2 goes through an unaligned load instead of being a pavgb memory
    ; operand: the SSE2 memory form faults unless the address is 16-byte
    ; aligned, whereas src1 and dst are already handled unaligned.
    movdqu      xmm1, [ecx]
    pavgb       xmm0, xmm1
    movdqu      [edi], xmm0

    dec         ebp
    ; lea leaves the flags alone, so jg below still tests the dec
    lea         ebx, [ebx+eax]
    lea         ecx, [ecx+edx]
    lea         edi, [edi+esi]
    jg          .height_loop

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret


;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64

; BIWEIGHT_4P_MMX dst_mem, src_mem
;   Blends four pixels in place:
;       dst = clip_u8( ( dst*mm4 + src*mm5 + mm6 ) >> 6 )
;   Expected on entry: mm4 = weight_dst, mm5 = weight_src, mm6 = rounding term
;   (one value per word), mm7 = 0.
;   Clobbers mm0 and mm1.
%macro BIWEIGHT_4P_MMX 2
    ; destination pixels: bytes -> words, scaled by weight_dst
    movd      mm0, %1
    punpcklbw mm0, mm7
    pmullw    mm0, mm4
    ; source pixels: bytes -> words, scaled by weight_src
    movd      mm1, %2
    punpcklbw mm1, mm7
    pmullw    mm1, mm5
    ; word additions wrap, so the order of the two adds does not matter
    paddw     mm0, mm6
    paddw     mm0, mm1
    psraw     mm0, 6            ; arithmetic shift: the sum may be negative
    pmaxsw    mm0, mm7          ; clamp below at 0
    packuswb  mm0, mm0          ; saturate to 0..255 and narrow to bytes
    movd      %1,  mm0
%endmacro

; BIWEIGHT_START_MMX
;   Common prologue of the x264_pixel_avg_weight_* functions.  Saves edi, esi
;   (and ebx through picpush), then sets up:
;       edi = dst          esi = i_dst
;       edx = src          ecx = i_src
;       mm4 = weight_dst in every word     mm5 = 64 - weight_dst
;       mm6 = 32 (rounding)                mm7 = 0
;   picpush / picgetgot / picesp / GOT_ebx are defined outside this file
;   (presumably PIC helpers from i386inc.asm).
%macro BIWEIGHT_START_MMX 0
    push    edi
    push    esi
    picpush ebx
    picgetgot ebx

    pxor    mm7, mm7
    movq    mm6, [pw_32 GOT_ebx] ; rounding
    movq    mm5, [pw_64 GOT_ebx]
    pshufw  mm4, [picesp+28], 0  ; weight_dst, low word copied to all four
    psubw   mm5, mm4             ; weight_src

    mov     ecx, [picesp+24]    ; i_src
    mov     edx, [picesp+20]    ; src
    mov     esi, [picesp+16]    ; i_dst
    mov     edi, [picesp+12]    ; dst
%endmacro
; BIWEIGHT_END_MMX
;   Common epilogue matching BIWEIGHT_START_MMX: restores the registers in the
;   reverse of the order they were saved there, then returns to the caller.
%macro BIWEIGHT_END_MMX 0
    picpop  ebx                 ; counterpart of "picpush ebx"
    pop     esi
    pop     edi
    ret
%endmacro

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int i_dst,
;                                                 uint8_t *src, int i_src,
;                                                 int i_weight_dst, int i_height )
;
;   In-place weighted blend of a 16-pixel-wide block, one row per iteration:
;       dst = clip( ( dst*w + src*(64-w) + 32 ) >> 6 ),   w = i_weight_dst
;   NOTE(review): the prototype above says "int", but eax only serves as the
;   row counter and carries no result -- presumably void; confirm against the
;   C declaration.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w16_mmxext
    BIWEIGHT_START_MMX
    mov     eax, [picesp+32]        ; rows remaining (i_height)
    ALIGN 4
.next_row:
    ; one row = four groups of 4 pixels
    BIWEIGHT_4P_MMX  [edi   ], [edx   ]
    BIWEIGHT_4P_MMX  [edi+ 4], [edx+ 4]
    BIWEIGHT_4P_MMX  [edi+ 8], [edx+ 8]
    BIWEIGHT_4P_MMX  [edi+12], [edx+12]

    lea     edi, [edi+esi]          ; dst += i_dst
    lea     edx, [edx+ecx]          ; src += i_src
    dec     eax
    jg      .next_row
    BIWEIGHT_END_MMX

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *dst, int i_dst,
;                                                uint8_t *src, int i_src,
;                                                int i_weight_dst, int i_height )
;
;   In-place weighted blend of an 8-pixel-wide block, two rows per iteration:
;       dst = clip( ( dst*w + src*(64-w) + 32 ) >> 6 ),   w = i_weight_dst
;   NOTE(review): the prototype above says "int", but eax only serves as the
;   row counter and carries no result -- presumably void; confirm against the
;   C declaration.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w8_mmxext
    BIWEIGHT_START_MMX
    mov     eax, [picesp+32]        ; rows remaining (i_height)
    ALIGN 4
.next_row_pair:
    ; first row of the pair
    BIWEIGHT_4P_MMX  [edi  ], [edx  ]
    BIWEIGHT_4P_MMX  [edi+4], [edx+4]
    add     edi, esi
    add     edx, ecx

    ; second row of the pair
    BIWEIGHT_4P_MMX  [edi  ], [edx  ]
    BIWEIGHT_4P_MMX  [edi+4], [edx+4]
    add     edi, esi
    add     edx, ecx

    sub     eax, byte 2             ; last flag writer before the branch
    jg      .next_row_pair
    BIWEIGHT_END_MMX

;-----------------------------------------------------------------------------
;   int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *dst, int i_dst,
;                                                 uint8_t *src, int i_src,
;                                                 int i_weight_dst )
;
;   In-place weighted blend of a fixed 4x4 block, fully unrolled:
;       dst = clip( ( dst*w + src*(64-w) + 32 ) >> 6 ),   w = i_weight_dst
;   NOTE(review): the prototype above says "int", but this function never
;   writes a result to eax -- presumably void; confirm against the C
;   declaration.
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext
    BIWEIGHT_START_MMX
    ; rows 0 and 1
    BIWEIGHT_4P_MMX  [edi    ], [edx    ]
    BIWEIGHT_4P_MMX  [edi+esi], [edx+ecx]
    ; move both pointers two rows down, then rows 2 and 3
    lea  edi, [edi+esi*2]
    lea  edx, [edx+ecx*2]
    BIWEIGHT_4P_MMX  [edi    ], [edx    ]
    BIWEIGHT_4P_MMX  [edi+esi], [edx+ecx]
    BIWEIGHT_END_MMX



;=============================================================================
; pixel copy
;=============================================================================

;-----------------------------------------------------------------------------
;  void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                            uint8_t *src, int i_src_stride, int i_height )
;
;  Copies a 4-byte-wide block, two rows per iteration.  The argument order
;  above is the one the code reads from the stack: the first argument is the
;  pointer that gets written.  Despite the name, only eax is used for the copy.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx
    push    ebx
    push    esi
    push    edi

    ; three pushes + return address => first argument lives at [esp+16]
    mov     edi, [esp+16]       ; dst
    mov     edx, [esp+20]       ; i_dst_stride
    mov     esi, [esp+24]       ; src
    mov     ebx, [esp+28]       ; i_src_stride
    mov     ecx, [esp+32]       ; rows remaining (i_height)
ALIGN 4
.copy_row_pair:
    mov     eax, [esi]          ; first row of the pair
    mov     [edi], eax
    add     esi, ebx
    add     edi, edx
    mov     eax, [esi]          ; second row of the pair
    mov     [edi], eax
    add     esi, ebx
    add     edi, edx
    dec     ecx
    dec     ecx                 ; last flag writer before the branch
    jg      .copy_row_pair

    pop     edi
    pop     esi
    pop     ebx
    ret

;-----------------------------------------------------------------------------
;   void x264_mc_copy_w8_mmx( uint8_t *dst, int i_dst_stride,
;                             uint8_t *src, int i_src_stride, int i_height )
;
;   Copies an 8-byte-wide block, four rows per iteration.  The argument order
;   above is the one the code reads from the stack: the first argument is the
;   pointer that gets written.
;   Clobbers: ecx, edx, mm0-mm3, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w8_mmx
    push    ebx
    push    esi
    push    edi

    ; three pushes + return address => first argument lives at [esp+16]
    mov     edi, [esp+16]       ; dst
    mov     edx, [esp+20]       ; i_dst_stride
    mov     esi, [esp+24]       ; src
    mov     ebx, [esp+28]       ; i_src_stride
    mov     ecx, [esp+32]       ; rows remaining (i_height)
ALIGN 4
.copy_4_rows:
    movq    mm0, [esi]          ; row 0
    movq    [edi], mm0
    movq    mm1, [esi+ebx]      ; row 1
    movq    [edi+edx], mm1
    lea     esi, [esi+ebx*2]
    lea     edi, [edi+edx*2]
    movq    mm2, [esi]          ; row 2
    movq    [edi], mm2
    movq    mm3, [esi+ebx]      ; row 3
    movq    [edi+edx], mm3
    lea     esi, [esi+ebx*2]
    lea     edi, [edi+edx*2]

    sub     ecx, byte 4         ; last flag writer before the branch
    jg      .copy_4_rows

    pop     edi
    pop     esi
    pop     ebx
    ret

;-----------------------------------------------------------------------------
;   void x264_mc_copy_w16_mmx( uint8_t *dst, int i_dst_stride,
;                              uint8_t *src, int i_src_stride, int i_height )
;
;   Copies a 16-byte-wide block as two 8-byte halves, four rows per iteration.
;   The argument order above is the one the code reads from the stack: the
;   first argument is the pointer that gets written.
;   Clobbers: ecx, edx, mm0-mm7, flags.  No emms is executed here.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_mmx
    push    ebx
    push    esi
    push    edi

    ; three pushes + return address => first argument lives at [esp+16]
    mov     edi, [esp+16]       ; dst
    mov     edx, [esp+20]       ; i_dst_stride
    mov     esi, [esp+24]       ; src
    mov     ebx, [esp+28]       ; i_src_stride
    mov     ecx, [esp+32]       ; rows remaining (i_height)

ALIGN 4
.copy_4_rows:
    ; row 0
    movq    mm0, [esi]
    movq    mm1, [esi+8]
    movq    [edi], mm0
    movq    [edi+8], mm1
    add     esi, ebx
    add     edi, edx
    ; row 1
    movq    mm2, [esi]
    movq    mm3, [esi+8]
    movq    [edi], mm2
    movq    [edi+8], mm3
    add     esi, ebx
    add     edi, edx
    ; row 2
    movq    mm4, [esi]
    movq    mm5, [esi+8]
    movq    [edi], mm4
    movq    [edi+8], mm5
    add     esi, ebx
    add     edi, edx
    ; row 3
    movq    mm6, [esi]
    movq    mm7, [esi+8]
    movq    [edi], mm6
    movq    [edi+8], mm7
    add     esi, ebx
    add     edi, edx

    sub     ecx, byte 4         ; last flag writer before the branch
    jg      .copy_4_rows

    pop     edi
    pop     esi
    pop     ebx
    ret


;-----------------------------------------------------------------------------
;   void x264_mc_copy_w16_sse2( uint8_t *dst, int i_dst_stride,
;                               uint8_t *src, int i_src_stride, int i_height )
;
;   Copies a 16-byte-wide block, two rows per iteration, using unaligned
;   128-bit loads and stores.  The argument order above is the one the code
;   reads from the stack: the first argument is the pointer that gets written.
;   Clobbers: ecx, edx, xmm0, xmm1, flags.
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w16_sse2
    push    ebx
    push    esi
    push    edi

    ; three pushes + return address => first argument lives at [esp+16]
    mov     edi, [esp+16]       ; dst
    mov     edx, [esp+20]       ; i_dst_stride
    mov     esi, [esp+24]       ; src
    mov     ebx, [esp+28]       ; i_src_stride
    mov     ecx, [esp+32]       ; rows remaining (i_height)

ALIGN 4
.copy_row_pair:
    movdqu  xmm0, [esi]         ; both rows are loaded before either store
    movdqu  xmm1, [esi+ebx]
    lea     esi, [esi+ebx*2]
    movdqu  [edi], xmm0
    movdqu  [edi+edx], xmm1
    lea     edi, [edi+edx*2]
    dec     ecx
    dec     ecx                 ; last flag writer before the branch
    jg      .copy_row_pair

    pop     edi
    pop     esi
    pop     ebx
    ret



;=============================================================================
; chroma MC
;=============================================================================

;-----------------------------------------------------------------------------
;   void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
;                               uint8_t *dst, int i_dst_stride,
;                               int dx, int dy,
;                               int i_width, int i_height )
;
;   Bilinear interpolation.  The integer parts (dx>>3, dy>>3) move the source
;   pointer; the fractions (dx&7, dy&7), in units of 1/8, give the weights:
;       cA = (8-dx)*(8-dy)   cB = dx*(8-dy)   cC = (8-dx)*dy   cD = dx*dy
;       dst[x] = ( cA*src[x]        + cB*src[x+1]
;                + cC*src[x+stride] + cD*src[x+stride+1] + 32 ) >> 6
;   Each row iteration produces 4 pixels.  When i_width == 8 the row loop is
;   run a second time for columns 4..7; any other width is treated as 4.
;   The src and i_width argument slots on the stack are modified in place.
;   No emms is executed here.
;-----------------------------------------------------------------------------

cglobal x264_mc_chroma_mmxext
    picpush ebx
    picgetgot ebx
    push    edi

    ; picesp+4+N addresses the arguments: the extra 4 skips the saved edi.
    mov     ecx, [picesp+4+24]  ; dy
    mov     edx, [picesp+4+20]  ; dx
    mov     eax, ecx            ; keep copies for the fractional parts
    mov     edi, edx
    sar     ecx, 3              ; dy>>3
    sar     edx, 3              ; dx>>3
    imul    ecx, [picesp+4+8]   ; (dy>>3) * i_src_stride
    add     ecx, edx
    add     [picesp+4+4], ecx     ; src += (dx>>3) + (dy>>3) * src_stride

    pxor    mm3, mm3            ; mm3 = 0, used for byte->word unpacking

    and     edi, 7
    and     eax, 7
    movd    mm5, edi
    movd    mm6, eax
    pshufw  mm5, mm5, 0         ; mm5 = dx&7
    pshufw  mm6, mm6, 0         ; mm6 = dy&7

    movq    mm4, [pw_8 GOT_ebx]
    movq    mm0, mm4

    psubw   mm4, mm5            ; mm4 = 8-dx
    psubw   mm0, mm6            ; mm0 = 8-dy

    movq    mm7, mm5
    pmullw  mm5, mm0            ; mm5 = dx*(8-dy) =     cB
    pmullw  mm7, mm6            ; mm7 = dx*dy =         cD
    pmullw  mm6, mm4            ; mm6 = (8-dx)*dy =     cC
    pmullw  mm4, mm0            ; mm4 = (8-dx)*(8-dy) = cA

    mov     eax, [picesp+4+4]   ; src
    mov     edi, [picesp+4+12]  ; dst
    mov     ecx, [picesp+4+8]   ; i_src_stride
    mov     edx, [picesp+4+32]  ; i_height

ALIGN 4
.height_loop

    movd    mm1, [eax+ecx]
    movd    mm0, [eax]
    punpcklbw mm1, mm3          ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw mm0, mm3
    pmullw  mm1, mm6            ; 2nd line * cC
    pmullw  mm0, mm4            ; 1st line * cA

    paddw   mm0, mm1            ; mm0 <- result

    movd    mm2, [eax+1]        ; same two lines, shifted one pixel right
    movd    mm1, [eax+ecx+1]
    punpcklbw mm2, mm3
    punpcklbw mm1, mm3

    paddw   mm0, [pw_32 GOT_ebx] ; rounding term for the >>6 below

    pmullw  mm2, mm5            ; line * cB
    pmullw  mm1, mm7            ; line * cD
    paddw   mm0, mm2
    paddw   mm0, mm1

    psrlw   mm0, 6
    packuswb mm0, mm3           ; 00 00 00 00 px1 px2 px3 px4
    movd    [edi], mm0

    add     eax, ecx            ; src += i_src_stride
    add     edi, [picesp+4+16]  ; dst += i_dst_stride

    dec     edx
    jnz     .height_loop

    ; i_width -= 8, written back to the stack slot.  It is zero only on the
    ; first pass of a width-8 block; after the second pass it is -8, so the
    ; function exits.
    sub     [picesp+4+28], dword 8
    jnz     .finish            ; width != 8 so assume 4

    ; second pass: reload the (already offset) src, dst and height, then
    ; process the columns 4 bytes to the right
    mov     edi, [picesp+4+12] ; dst
    mov     eax, [picesp+4+4]  ; src
    mov     edx, [picesp+4+32] ; i_height
    add     edi, 4
    add     eax, 4
    jmp    .height_loop

.finish
    pop     edi
    picpop  ebx
    ret



; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
; TODO add 32 and 128 byte versions for P3/P4

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;
; Issues prefetcht0 hints for:
;   - 4 consecutive rows starting at pix_y  + (mb_x&3)*4*stride_y + 64
;   - 2 consecutive rows starting at pix_uv + (mb_x&6)*stride_uv  + 64
; Leaf function with no stack frame: arguments start at [esp+4].
; Clobbers: eax, ecx, edx, flags.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_fenc_mmxext
    ; ---- first plane (pix_y) ----
    mov   edx, [esp+4]              ; pix_y
    mov   ecx, [esp+8]              ; stride_y
    mov   eax, [esp+20]             ; mb_x
    and   eax, 3
    imul  eax, ecx                  ; (mb_x&3) * stride_y
    lea   edx, [edx+eax*4+64]
    prefetcht0 [edx]                ; row 0
    prefetcht0 [edx+ecx]            ; row 1
    lea   edx, [edx+ecx*2]
    prefetcht0 [edx]                ; row 2
    prefetcht0 [edx+ecx]            ; row 3

    ; ---- second plane (pix_uv) ----
    mov   edx, [esp+12]             ; pix_uv
    mov   ecx, [esp+16]             ; stride_uv
    mov   eax, [esp+20]             ; mb_x
    and   eax, 6
    imul  eax, ecx                  ; (mb_x&6) * stride_uv
    lea   edx, [edx+eax+64]
    prefetcht0 [edx]                ; row 0
    prefetcht0 [edx+ecx]            ; row 1
    ret

;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;
; Issues prefetcht0 hints for 8 consecutive rows starting at
;     pix + ((parity-1) & stride)*8 + 64
; i.e. 8 rows further down when parity == 0, and at pix + 64 when parity == 1.
; Leaf function with no stack frame: arguments start at [esp+4].
; Clobbers: eax, ecx, edx, flags.
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext
    mov   edx, [esp+4]              ; pix
    mov   ecx, [esp+8]              ; stride
    mov   eax, [esp+12]             ; parity
    dec   eax                       ; parity-1: all ones for 0, zero for 1
    and   eax, ecx                  ; -> stride or 0
    lea   edx, [edx+eax*8+64]
    lea   eax, [ecx+ecx*2]          ; eax = 3*stride

    ; rows 0..3
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    prefetcht0 [edx+ecx*2]
    prefetcht0 [edx+eax]

    ; rows 4..7
    lea   edx, [edx+ecx*4]
    prefetcht0 [edx]
    prefetcht0 [edx+ecx]
    prefetcht0 [edx+ecx*2]
    prefetcht0 [edx+eax]
    ret