www.pudn.com > MPEG4Codec.zip > deblock_horiz_lpf9.c


/************************************************************************** 
 *                                                                        * 
 * This code has been developed by John Funnell. This software is an      * 
 * implementation of a part of one or more MPEG-4 Video tools as          * 
 * specified in ISO/IEC 14496-2 standard.  Those intending to use this    * 
 * software module in hardware or software products are advised that its  * 
 * use may infringe existing patents or copyrights, and any such use      * 
 * would be at such party's own risk.  The original developer of this     * 
 * software module and his/her company, and subsequent editors and their  * 
 * companies (including Project Mayo), will have no liability for use of  * 
 * this software or modifications or derivatives thereof.                 * 
 *                                                                        * 
 * Project Mayo gives users of the Codec a license to this software       * 
 * module or modifications thereof for use in hardware or software        * 
 * products claiming conformance to the MPEG-4 Video Standard as          * 
 * described in the Open DivX license.                                    * 
 *                                                                        * 
 * The complete Open DivX license can be found at                         * 
 * http://www.projectmayo.com/opendivx/license.php                        * 
 *                                                                        * 
 **************************************************************************/ 
/** 
*  Copyright (C) 2001 - Project Mayo 
 * 
 * John Funnell 
 * 
 * DivX Advanced Research Center  
* 
**/ 
 
/*** 
 
References:   
 * ISO/IEC 14496-2 
 * MoMuSys-FDIS-V1.0-990812 
 * Intel Architecture Software Developer's Manual 
   Volume 2: Instruction Set Reference 
 
***/ 
 
#include "postprocess_mmx.h" 
 
/* John Funnell, December 2000 */ 
 
 
const static uint64_t mm64_0008 = 0x0008000800080008; 
const static uint64_t mm64_0101 = 0x0101010101010101; 
static uint64_t mm64_temp; 
const static uint64_t mm64_coefs[18] =  { 
	0x0001000200040006, /* p1 left */ 0x0000000000000001, /* v1 right */ 
	0x0001000200020004, /* v1 left */ 0x0000000000010001, /* v2 right */ 
	0x0002000200040002, /* v2 left */ 0x0000000100010002, /* v3 right */ 
	0x0002000400020002, /* v3 left */ 0x0001000100020002, /* v4 right */ 
	0x0004000200020001, /* v4 left */ 0x0001000200020004, /* v5 right */ 
	0x0002000200010001, /* v5 left */ 0x0002000200040002, /* v6 right */ 
	0x0002000100010000, /* v6 left */ 0x0002000400020002, /* v7 right */ 
	0x0001000100000000, /* v7 left */ 0x0004000200020001, /* v8 right */ 
	0x0001000000000000, /* v8 left */ 0x0006000400020001  /* p2 right */ 
}; 
static uint32_t mm32_p1p2; 
static uint8_t *pmm1; 
 
 
 
 
/* The 9-tap low pass filter used in "DC" regions */ 
/* I'm not sure that I like this implementation any more...! */ 
INLINE void deblock_horiz_lpf9(uint8_t *v, int stride, int QP) { 
	int y, p1, p2; 
	#ifdef PP_SELF_CHECK 
	uint8_t selfcheck[9]; 
	int psum; 
	uint8_t *vv;  
	int i;	 
	#endif 
 
	for (y=0; y<4; y++) { 
		p1 = (ABS(v[0+y*stride]-v[1+y*stride]) < QP ) ?  v[0+y*stride] : v[1+y*stride]; 
		p2 = (ABS(v[8+y*stride]-v[9+y*stride]) < QP ) ?  v[9+y*stride] : v[8+y*stride]; 
 
		mm32_p1p2 = 0x0101 * ((p2 << 16) + p1); 
 
		#ifdef PP_SELF_CHECK 
		/* generate a self-check version of the filter result in selfcheck[9] */ 
		/* low pass filtering (LPF9: 1 1 2 2 4 2 2 1 1) */ 
		vv = &(v[y*stride]); 
		psum = p1 + p1 + p1 + vv[1] + vv[2] + vv[3] + vv[4] + 4; 
		selfcheck[1] = (((psum + vv[1]) << 1) - (vv[4] - vv[5])) >> 4; 
		psum += vv[5] - p1;  
		selfcheck[2] = (((psum + vv[2]) << 1) - (vv[5] - vv[6])) >> 4; 
		psum += vv[6] - p1;  
		selfcheck[3] = (((psum + vv[3]) << 1) - (vv[6] - vv[7])) >> 4; 
		psum += vv[7] - p1;  
		selfcheck[4] = (((psum + vv[4]) << 1) + p1 - vv[1] - (vv[7] - vv[8])) >> 4; 
		psum += vv[8] - vv[1];  
		selfcheck[5] = (((psum + vv[5]) << 1) + (vv[1] - vv[2]) - vv[8] + p2) >> 4; 
		psum += p2 - vv[2];  
		selfcheck[6] = (((psum + vv[6]) << 1) + (vv[2] - vv[3])) >> 4; 
		psum += p2 - vv[3];  
		selfcheck[7] = (((psum + vv[7]) << 1) + (vv[3] - vv[4])) >> 4; 
		psum += p2 - vv[4];  
		selfcheck[8] = (((psum + vv[8]) << 1) + (vv[4] - vv[5])) >> 4; 
		#endif 
 
		pmm1 = (&(v[y*stride-3])); /* this is 64-aligned */ 
 
		/* mm7 = 0, mm6 is left hand accumulator, mm5 is right hand acc */ 
		__asm { 
			push eax 
			push ebx 
			mov eax, pmm1 
			lea ebx, mm64_coefs 
 
			#ifdef PREFETCH_ENABLE 
			prefetcht0 32[ebx]                      
			#endif 
 
			movd   mm0,   mm32_p1p2            /* mm0 = ________p2p2p1p1    0w1 2 3 4 5 6 7    */ 
			punpcklbw mm0, mm0                 /* mm0 = p2p2p2p2p1p1p1p1    0m1 2 3 4 5 6 7    */ 
 
			movq    mm2, qword ptr [eax]       /* mm2 = v4v3v2v1xxxxxxxx    0 1 2w3 4 5 6 7    */ 
			pxor    mm7, mm7                   /* mm7 = 0000000000000000    0 1 2 3 4 5 6 7w   */ 
 
			movq     mm6, mm64_0008            /* mm6 = 0008000800080008    0 1 2 3 4 5 6w7    */ 
			punpckhbw mm2, mm2                 /* mm2 = v4__v3__v2__v1__    0 1 2m3 4 5 6 7    */ 
 
			movq     mm64_temp, mm0            /*temp = p2p2p2p2p1p1p1p1    0r1 2 3 4 5 6 7    */ 
 
			punpcklbw mm0, mm7                 /* mm0 = __p1__p1__p1__p1    0m1 2 3 4 5 6 7    */ 
			movq      mm5, mm6                 /* mm5 = 0008000800080008    0 1 2 3 4 5w6r7    */ 
 
			pmullw    mm0, [ebx]               /* mm0 *= mm64_coefs[0]      0m1 2 3 4 5 6 7    */ 
 
			movq      mm1, mm2                 /* mm1 = v4v4v3v3v2v2v1v1    0 1w2r3 4 5 6 7    */ 
			punpcklbw mm2, mm2                 /* mm2 = v2v2v2v2v1v1v1v1    0 1 2m3 4 5 6 7    */ 
 
			punpckhbw mm1, mm1                 /* mm1 = v4v4v4v4v3v3v3v3    0 1m2 3 4 5 6 7    */ 
 
			#ifdef PREFETCH_ENABLE 
			prefetcht0 32[ebx]                      
			#endif 
 
			movq      mm3, mm2                /* mm3 = v2v2v2v2v1v1v1v1    0 1 2r3w4 5 6 7    */ 
			punpcklbw mm2, mm7                /* mm2 = __v1__v1__v1__v1    0 1 2m3 4 5 6 7    */ 
 
			punpckhbw mm3, mm7                /* mm3 = __v2__v2__v2__v2    0 1 2 3m4 5 6 7    */ 
			paddw     mm6, mm0                /* mm6 += mm0                0r1 2 3 4 5 6m7    */ 
 
			movq      mm0, mm2                /* mm0 = __v1__v1__v1__v1    0w1 2r3 4 5 6 7    */  
 
			pmullw    mm0, 8[ebx]             /* mm2 *= mm64_coefs[1]      0m1 2 3 4 5 6 7    */ 
			movq      mm4, mm3                /* mm4 = __v2__v2__v2__v2    0 1 2 3r4w5 6 7    */  
 
			pmullw    mm2, 16[ebx]            /* mm2 *= mm64_coefs[2]      0 1 2m3 4 5 6 7    */ 
 
			pmullw    mm3, 32[ebx]            /* mm3 *= mm64_coefs[4]      0 1 2 3m4 5 6 7    */ 
 
			pmullw    mm4, 24[ebx]            /* mm3 *= mm64_coefs[3]      0 1 2 3 4m5 6 7    */ 
			paddw     mm5, mm0                /* mm5 += mm0                0r1 2 3 4 5m6 7    */ 
 
			paddw     mm6, mm2                /* mm6 += mm2                0 1 2r3 4 5 6m7    */ 
			movq      mm2, mm1                /* mm2 = v4v4v4v4v3v3v3v3    0 1 2 3 4 5 6 7    */ 
 
			punpckhbw mm2, mm7                /* mm2 = __v4__v4__v4__v4    0 1 2m3 4 5 6 7r   */ 
			paddw     mm5, mm4                /* mm5 += mm4                0 1 2 3 4r5m6 7    */ 
 
			punpcklbw mm1, mm7                /* mm1 = __v3__v3__v3__v3    0 1m2 3 4 5 6 7r   */ 
			paddw     mm6, mm3                /* mm6 += mm3                0 1 2 3r4 5 6m7    */ 
 
			#ifdef PREFETCH_ENABLE 
			prefetcht0 64[ebx]                    
			#endif 
			movq      mm0, mm1                /* mm0 = __v3__v3__v3__v3    0w1 2 3 4 5 6 7    */  
 
			pmullw    mm1, 48[ebx]            /* mm1 *= mm64_coefs[6]      0 1m2 3 4 5 6 7    */ 
 
			pmullw    mm0, 40[ebx]            /* mm0 *= mm64_coefs[5]      0m1 2 3 4 5 6 7    */ 
			movq      mm4, mm2                /* mm4 = __v4__v4__v4__v4    0 1 2r3 4w5 6 7    */  
 
			pmullw    mm2, 64[ebx]            /* mm2 *= mm64_coefs[8]      0 1 2 3 4 5 6 7    */ 
			paddw     mm6, mm1                /* mm6 += mm1                0 1 2 3 4 5 6 7    */ 
 
			pmullw    mm4, 56[ebx]            /* mm4 *= mm64_coefs[7]      0 1 2 3 4m5 6 7    */ 
			pxor      mm3, mm3                /* mm3 = 0000000000000000    0 1 2 3w4 5 6 7    */ 
 
			movq      mm1, 8[eax]             /* mm1 = xxxxxxxxv8v7v6v5    0 1w2 3 4 5 6 7    */ 
			paddw     mm5, mm0                /* mm5 += mm0                0r1 2 3 4 5 6 7    */ 
 
			punpcklbw mm1, mm1                /* mm1 = v8v8v7v7v6v6v5v5    0 1m2 3m4 5 6 7    */ 
			paddw     mm6, mm2                /* mm6 += mm2                0 1 2r3 4 5 6 7    */ 
 
			#ifdef PREFETCH_ENABLE 
			prefetcht0 96[ebx]                    
			#endif 
 
			movq      mm2, mm1                /* mm2 = v8v8v7v7v6v6v5v5    0 1r2w3 4 5 6 7    */ 
			paddw     mm5, mm4                /* mm5 += mm4                0 1 2 3 4r5 6 7    */ 
 
			punpcklbw mm2, mm2                /* mm2 = v6v6v6v6v5v5v5v5    0 1 2m3 4 5 6 7    */ 
			punpckhbw mm1, mm1                /* mm1 = v8v8v8v8v7v7v7v7    0 1m2 3 4 5 6 7    */ 
 
			movq      mm3, mm2                /* mm3 = v6v6v6v6v5v5v5v5    0 1 2r3w4 5 6 7    */ 
			punpcklbw mm2, mm7                /* mm2 = __v5__v5__v5__v5    0 1 2m3 4 5 6 7r   */ 
 
			punpckhbw mm3, mm7                /* mm3 = __v6__v6__v6__v6    0 1 2 3m4 5 6 7r   */ 
			movq      mm0, mm2                /* mm0 = __v5__v5__v5__v5    0w1 2b3 4 5 6 7    */  
 
			pmullw    mm0, 72[ebx]            /* mm0 *= mm64_coefs[9]      0m1 2 3 4 5 6 7    */ 
			movq      mm4, mm3                /* mm4 = __v6__v6__v6__v6    0 1 2 3 4w5 6 7    */  
 
			pmullw    mm2, 80[ebx]            /* mm2 *= mm64_coefs[10]     0 1 2m3 4 5 6 7    */ 
 
			pmullw    mm3, 96[ebx]            /* mm3 *= mm64_coefs[12]     0 1 2 3m4 5 6 7    */ 
 
			pmullw    mm4, 88[ebx]            /* mm4 *= mm64_coefs[11]     0 1 2 3 4m5 6 7    */ 
			paddw     mm5, mm0                /* mm5 += mm0                0r1 2 3 4 5 6 7    */ 
 
			paddw     mm6, mm2                /* mm6 += mm2                0 1 2r3 4 5 6 7    */ 
			movq      mm2, mm1                /* mm2 = v8v8v8v8v7v7v7v7    0 1r2w3 4 5 6 7    */ 
 
			paddw     mm6, mm3                /* mm6 += mm3                0 1 2 3r4 5 6 7    */ 
			punpcklbw mm1, mm7                /* mm1 = __v7__v7__v7__v7    0 1m2 3 4 5 6 7r   */ 
 
			paddw     mm5, mm4                /* mm5 += mm4                0 1 2 3 4r5 6 7    */ 
			punpckhbw mm2, mm7                /* mm2 = __v8__v8__v8__v8    0 1 2m3 4 5 6 7    */ 
 
			#ifdef PREFETCH_ENABLE 
			prefetcht0 128[ebx]                   
			#endif 
 
			movq      mm3, mm64_temp          /* mm0 = p2p2p2p2p1p1p1p1    0 1 2 3w4 5 6 7    */ 
			movq      mm0, mm1                /* mm0 = __v7__v7__v7__v7    0w1r2 3 4 5 6 7    */  
 
			pmullw    mm0, 104[ebx]           /* mm0 *= mm64_coefs[13]     0m1b2 3 4 5 6 7    */ 
			movq      mm4, mm2                /* mm4 = __v8__v8__v8__v8    0 1 2r3 4w5 6 7    */  
 
			pmullw    mm1, 112[ebx]           /* mm1 *= mm64_coefs[14]     0 1w2 3 4 5 6 7    */ 
			punpckhbw mm3, mm7                /* mm0 = __p2__p2__p2__p2    0 1 2 3 4 5 6 7    */ 
 
			pmullw    mm2, 128[ebx]           /* mm2 *= mm64_coefs[16]     0 1b2m3 4 5 6 7    */ 
 
			pmullw    mm4, 120[ebx]           /* mm4 *= mm64_coefs[15]     0 1b2 3 4m5 6 7    */ 
			paddw     mm5, mm0                /* mm5 += mm0                0r1 2 3 4 5m6 7    */ 
 
			pmullw    mm3, 136[ebx]           /* mm0 *= mm64_coefs[17]     0 1 2 3m4 5 6 7    */ 
			paddw     mm6, mm1                /* mm6 += mm1                0 1w2 3 4 5 6m7    */ 
 
			paddw     mm6, mm2                /* mm6 += mm2                0 1 2r3 4 5 6m7    */ 
 
			paddw     mm5, mm4                /* mm5 += mm4                0 1 2 3 4r5m6 7    */ 
			psrlw     mm6, 4                  /* mm6 /= 16                 0 1 2 3 4 5 6m7    */ 
 
			paddw     mm5, mm3                /* mm6 += mm0                0 1 2 3r4 5m6 7    */ 
 
			psrlw     mm5, 4                  /* mm5 /= 16                 0 1 2 3 4 5m6 7    */ 
 
			packuswb  mm6, mm5                /* pack result into mm6      0 1 2 3 4 5r6m7    */ 
 
			movq      4[eax], mm6             /* v[] = mm6                 0 1 2 3 4 5 6r7    */ 
 
			pop ebx 
			pop eax 
 
 
 
 
		}; 
	 
		#ifdef PP_SELF_CHECK 
		for (i=1; i<=8; i++) { 
			if (selfcheck[i] != v[i+y*stride]) { 
				printf("ERROR: MMX version of horiz lpf9 is incorrect at %d\n", i); 
			} 
		} 
		#endif 
 
	} 
 
}