/* www.pudn.com > mpeg4_DECORE.rar > transferIDCT_mmx.c */


/************************************************************************** 
 *                                                                        * 
 * This code has been developed by John Funnell. This software is an      * 
 * implementation of a part of one or more MPEG-4 Video tools as          * 
 * specified in ISO/IEC 14496-2 standard.  Those intending to use this    * 
 * software module in hardware or software products are advised that its  * 
 * use may infringe existing patents or copyrights, and any such use      * 
 * would be at such party's own risk.  The original developer of this     * 
 * software module and his/her company, and subsequent editors and their  * 
 * companies (including Project Mayo), will have no liability for use of  * 
 * this software or modifications or derivatives thereof.                 * 
 *                                                                        * 
 * Project Mayo gives users of the Codec a license to this software       * 
 * module or modifications thereof for use in hardware or software        * 
 * products claiming conformance to the MPEG-4 Video Standard as          * 
 * described in the Open DivX license.                                    * 
 *                                                                        * 
 * The complete Open DivX license can be found at                         * 
 * http://www.projectmayo.com/opendivx/license.php                        * 
 *                                                                        * 
 **************************************************************************/ 
/** 
*  Copyright (C) 2001 - Project Mayo 
 * 
 * John Funnell 
 * 
 * DivX Advanced Research Center  
**/ 
// transferIDCT_mmx.c // 
 
/* routines to transfer iDCT block result into the output picture */ 
 
/* this routine still needs to be optimised for pipelining        */ 
/* just haven't got time to finish the optimisations here ;-)     */ 
 
/* inline masm is used here so not compatible with gnu            */ 
 
 
 
//#define _TEST_TRANSFER 
 
#ifdef _TEST_TRANSFER 
#include <stdio.h>   /* printf() used by the self-check code */
#endif 
 
 
#include "portab.h" 
 
void transferIDCT_add(int16_t *sourceS16, uint8_t *destU8, int stride) { 
	#ifdef _TEST_TRANSFER 
	uint8_t reference_dest[64]; 
	int x, y, sum16; 
 
	/* populate reference_dest[] with the correct result... */ 
	for (y=0; y<8; y++) { 
		for (x=0; x<8; x++) { 
			sum16 = (destU8[stride*y + x] + sourceS16[8*y + x]); 
			if      (sum16 > 255) reference_dest[8*y + x] = 255; 
			else if (sum16 <   0) reference_dest[8*y + x] =   0; 
			else                  reference_dest[8*y + x] = (uint8_t)sum16; 
		} 
	} 
	#endif 
 
	_asm { 
 
	; not sure about the state handling here - there must be a better way 
	push eax 
	push ebx 
	push edi 
 
	mov eax, sourceS16           ;  parameter 1, *sourceS16 
	mov ebx, destU8              ;  parameter 2, *destU8 
	mov edi, stride              ;  parameter 3, stride 
	pxor mm7, mm7                ;  set mm7 = 0 
 
; lines 0 to 7 all scheduled in together 
	movq mm0,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm1,  mm0               ;  eight bytes of destination into mm0 
	punpcklbw mm0, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm1, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm0, qword ptr [eax]  ;  add source and destination 
	paddsw mm1, qword ptr [eax+8];  add source and destination 
	packuswb mm0, mm1            ;  pack mm0 and mm1 into mm0 
	movq  qword ptr  [ebx], mm0  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm2,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm3,  mm2               ;  eight bytes of destination into mm3 
	punpcklbw mm2, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm3, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm2, qword ptr [eax+16]  ;  add source and destination 
	paddsw mm3, qword ptr [eax+24];  add source and destination 
	packuswb mm2, mm3            ;  pack mm0 and mm1 into mm0 
	movq  qword ptr  [ebx], mm2  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm4,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm5,  mm4               ;  eight bytes of destination into mm5 
	punpcklbw mm4, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm5, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm4, qword ptr [eax+32]  ;  add source and destination 
	paddsw mm5, qword ptr [eax+40];  add source and destination 
	packuswb mm4, mm5            ;  pack mm0 and mm1 into mm0 
	movq  qword ptr  [ebx], mm4  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm0,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm1,  qword ptr [ebx]   ;  eight bytes of destination into mm5 
	punpcklbw mm0, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm1, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm0, qword ptr [eax+48]  ;  add source and destination 
	paddsw mm1, qword ptr [eax+56];  add source and destination 
	packuswb mm0, mm1            ;  pack mm0 and mm1 into mm0 
	add eax, 64                  ;  add +64 to source ptr                 
	movq  qword ptr  [ebx], mm0  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm2,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm3,  mm2               ;  eight bytes of destination into mm3 
	punpcklbw mm2, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm3, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm2, qword ptr [eax]  ;  add source and destination 
	paddsw mm3, qword ptr [eax+8];  add source and destination 
	packuswb mm2, mm3            ;  pack mm0 and mm1 into mm0 
	add eax, 16                  ;  add +16 to source ptr                 
	movq  qword ptr  [ebx], mm2  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm4,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm5,  mm4               ;  eight bytes of destination into mm5 
	punpcklbw mm4, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm5, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm4, qword ptr [eax]  ;  add source and destination 
	paddsw mm5, qword ptr [eax+8];  add source and destination 
	packuswb mm4, mm5            ;  pack mm0 and mm1 into mm0 
	add eax, 16                  ;  add +16 to source ptr                 
	movq  qword ptr  [ebx], mm4  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm0,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm1,  mm0               ;  eight bytes of destination into mm1 
	punpcklbw mm0, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm1, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm0, qword ptr [eax]  ;  add source and destination 
	paddsw mm1, qword ptr [eax+8];  add source and destination 
	packuswb mm0, mm1            ;  pack mm0 and mm1 into mm0 
	add eax, 16                  ;  add +16 to source ptr                 
	movq  qword ptr  [ebx], mm0  ;  copy output to destination 
	add ebx, edi                 ;  add +stride to dest ptr 
 
	movq mm2,  qword ptr [ebx]   ;  eight bytes of destination into mm4 
	movq mm3,  mm2               ;  eight bytes of destination into mm3 
	punpcklbw mm2, mm7           ;  unpack first 4 bytes from dest into mm4 
	punpckhbw mm3, mm7           ;  unpack next 4 bytes from dest into mm5 
	paddsw mm2, qword ptr [eax]  ;  add source and destination 
	paddsw mm3, qword ptr [eax+8];  add source and destination 
	packuswb mm2, mm3            ;  pack mm0 and mm1 into mm0 
	movq  qword ptr  [ebx], mm2  ;  copy output to destination 
 
	pop edi 
	pop ebx  
	pop eax 
 
	emms 
 
 
	} 
 
	#ifdef _TEST_TRANSFER 
	/* check destination against reference_dest[]... */ 
	for (y=0; y<8; y++) { 
		for (x=0; x<8; x++) { 
			if (reference_dest[8*y + x] != destU8[stride*y + x]) printf("transferIDCT_add() is broken\n"); 
		} 
	} 
	#endif 
   
} 
 
void transferIDCT_copy(int16_t *sourceS16, uint8_t *destU8, int stride) { 
	#ifdef _TEST_TRANSFER 
	int x, y, clipped; 
	#endif 
	 
	_asm { 
 
	; not sure about the state handling here - there must be a better way 
	push eax 
	push ebx 
	push edi 
 
	mov eax, sourceS16           ;  parameter 1, *sourceS16 
	mov ebx, destU8              ;  parameter 2, *destU8 
	mov edi, stride              ;  parameter 3, stride 
 
; lines 0 to 7 schedueled into each other... 
	movq mm0, qword ptr [eax]       ;  move first four words into mm0 
 
	packuswb mm0, qword ptr [eax+8] ;  pack mm0 and the next four words into mm0 
 
	movq mm1, qword ptr [eax+16]    ;  move first four words into mm1 
 
	packuswb mm1, qword ptr [eax+24];  pack mm0 and the next four words into mm1 
 
	movq mm2, qword ptr [eax+32]    ;  move first four words into mm2 
 
	packuswb mm2, qword ptr [eax+40];  pack mm0 and the next four words into mm2 
 
	movq mm3, qword ptr [eax+48]    ;  move first four words into mm3 
 
	packuswb mm3, qword ptr [eax+56] ;  pack mm3 and the next four words into mm3 
 
	movq qword ptr [ebx], mm0       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm1       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm2       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm3       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
	 
	movq mm0, qword ptr [eax+64]    ;  move first four words into mm0 
	add eax, 64                     ;  add 64 to source ptr                 
 
	packuswb mm0, qword ptr [eax+8] ;  pack mm0 and the next four words into mm0 
 
	movq mm1, qword ptr [eax+16]    ;  move first four words into mm1 
 
	packuswb mm1, qword ptr [eax+24];  pack mm0 and the next four words into mm1 
 
	movq mm2, qword ptr [eax+32]    ;  move first four words into mm2 
 
	packuswb mm2, qword ptr [eax+40];  pack mm0 and the next four words into mm2 
 
	movq mm3, qword ptr [eax+48]    ;  move first four words into mm3 
 
	packuswb mm3, qword ptr [eax+56];  pack mm3 and the next four words into mm3 
 
	movq qword ptr [ebx], mm0       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm1       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm2       ;  copy output to destination 
	add ebx, edi                    ;  add +stride to dest ptr 
 
	movq qword ptr [ebx], mm3       ;  copy output to destination 
 
	pop edi 
	pop ebx  
	pop eax 
 
	emms 
 
	} 
 
	#ifdef _TEST_TRANSFER 
	for (y=0; y<8; y++) { 
		for (x=0; x<8; x++) { 
			clipped = sourceS16[8*y + x]; 
			if (clipped > 255) clipped = 255; 
			if (clipped <   0) clipped =   0; 
			if (clipped != destU8[stride*y+x]) printf("transferIDCT_copy() is broken\n"); 
		} 
	} 
	#endif 
 
}