// Copy.cpp (listing taken from www.pudn.com, archive src.rar)


#include "stdafx.h" 
#include <math.h>
#include "DirectVobSubFilter.h" 
#include "misc.h" 
 
// Per-channel luma lookup tables, defined in another translation unit.
// BltLineRGB32 adds one entry from each table (indexed by bytes 0, 1 and 2 of
// a subpicture pixel), adds 0x108000 and shifts right by 16 to get a Y value.
extern int c2y_yb[256]; 
extern int c2y_yg[256]; 
extern int c2y_yr[256]; 
 
// Four 16-bit words, low to high: 0x0010, 0x0080, 0x0010, 0x0080.
// MixLine subtracts this from the four bytes of a YUY2 dword after unpacking
// them to words (0x10 from the two luma bytes, 0x80 from the two chroma bytes)
// before scaling them by the subpicture alpha.
static const __int64 _8181 = 0x0080001000800010i64; 
 
// Mixes one subpicture line into one line of video and stores the result in d.
//
//  d       - destination line
//  s       - source video line, or NULL. With NULL there is no picture under
//            the subpicture: subpicture pixels are written unscaled and the
//            untouched pixels are filled with the format's black value.
//  sub     - subpicture line, 4 bytes per pixel. Byte 3 is the factor the
//            source pixel is multiplied with (>>8); 0xff means "leave the
//            source pixel as it is". The colour bytes are added unscaled
//            (presumably they are pre-multiplied - verify against the code
//            that fills the subpicture).
//  pitch   - length of the destination line in bytes (assumes uint is 32 bits)
//  subtype - output media subtype, selects the pixel format branch; a subtype
//            not listed below writes nothing.
void MixLine(uint* d, uint* s, uchar* sub, int pitch, const GUID& subtype) 
{ 
	if(subtype == MEDIASUBTYPE_YUY2) 
	{ 
		// one dword of YUY2 holds two pixels (y1|u|y2|v), so each step consumes
		// two subpicture pixels (8 bytes)
		uint* dstend = d + (pitch>>2); 
 
		if(s != NULL) 
		{ 
			for(; d < dstend; sub+=8, s++, d++) 
			{ 
				// the shared chroma samples use the average of both pixels' alpha
				int a3 = (sub[3]+sub[7])>>1; 
 
				if(a3 < 0xff) 
				{ 
//					rgb2yuv(sub[2], sub[1], sub[0], sub[6], sub[5], sub[4]); 
/* 
					dy1 = (((((int)(*s)&0xff)-0x10)*sub[3])>>8) + sub[1]; // + y1; 
					dy2 = (((((int)(*s>>16)&0xff)-0x10)*sub[7])>>8) + sub[5]; // + y2; 
					du = (((((int)(*s>>8)&0xff)-0x80)*a3)>>8) + sub[0]; // + u; 
					dv = (((((int)(*s>>24)&0xff)-0x80)*a3)>>8) + sub[4]; // + v; 
 
					*d = (dv<<24)|(dy2<<16)|(du<<8)|dy1; 
*/ 
					// ia: per-component alpha, c: per-component subpicture value,
					// both laid out like the YUY2 dword
					uint ia = (a3<<24)|(sub[7]<<16)|(a3<<8)|sub[3]; 
					uint c = (sub[4]<<24)|(sub[5]<<16)|(sub[0]<<8)|sub[1]; // (v<<24)|(y2<<16)|(u<<8)|y1; 
 
					// MMX version of the commented-out scalar code above
					__asm 
					{ 
						mov			esi, s 
						mov			edi, d 
						pxor		mm0, mm0 
						movq		mm1, _8181 
						movd		mm2, c 
						punpcklbw	mm2, mm0 
						movd		mm3, [esi] 
						punpcklbw	mm3, mm0 
						movd		mm4, ia 
						punpcklbw	mm4, mm0 
						psrlw		mm4, 1 
						psubsw		mm3, mm1 
						pmullw		mm3, mm4 
						psraw		mm3, 7 
						paddsw		mm3, mm2 
						packuswb	mm3, mm3 
						movd		[edi], mm3 
					}; 
				} 
				else 
				{ 
					*d = *s; 
				} 
			} 
		} 
		else 
		{ 
			for(; d < dstend; sub+=8, d++) 
			{ 
				// written when at least one of the two pixels is not fully transparent
				if((sub[3]+sub[7]) < (0xff<<1)) 
				{ 
//					rgb2yuv(sub[2], sub[1], sub[0], sub[6], sub[5], sub[4]); 
//					*d = (v<<24)|(y2<<16)|(u<<8)|y1; 
 
					*d = (sub[4]<<24)|(sub[5]<<16)|(sub[0]<<8)|sub[1]; 
				} 
				else 
				{ 
					// YUY2 black: y = 0x10, u = v = 0x80
					*d = 0x80108010; 
				} 
			} 
		} 
 
	} 
	else if(subtype == MEDIASUBTYPE_RGB555) 
	{ 
		// 16 bits per pixel, 5 bits per channel
		ushort* ss = (ushort*)s; 
		ushort* ds = (ushort*)d; 
		ushort* dstend = (ushort*)(d + (pitch>>2)); 
 
		if(ss != NULL) 
		{ 
			for(; ds < dstend; sub+=4, ss++, ds++) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					// each channel: scale the source bits by alpha in place, add the
					// subpicture channel shifted down to the same bit position
					*ds = (((((*ss&0x7c00)*sub[3])>>8) + ((*((uint*)sub)>>9)&0x7c00))&0x7c00) 
						| (((((*ss&0x03e0)*sub[3])>>8) + ((*((uint*)sub)>>6)&0x03e0))&0x03e0) 
						| (((((*ss&0x001f)*sub[3])>>8) + ((*((uint*)sub)>>3)&0x001f))&0x001f); 
				} 
				else 
				{ 
					*ds = *ss; 
				} 
			} 
		} 
		else 
		{ 
			for(; ds < dstend; sub+=4, ds++) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					// plain 8:8:8 -> 5:5:5 conversion of the subpicture pixel
					*ds = ((*((uint*)sub)>>9)&0x7c00)|((*((uint*)sub)>>6)&0x03e0)|((*((uint*)sub)>>3)&0x001f); 
				} 
				else 
				{ 
					*ds = 0; 
				} 
			} 
		} 
	} 
	else if(subtype == MEDIASUBTYPE_RGB565) 
	{ 
		// same as the RGB555 branch with 5:6:5 masks and shifts
		ushort* ss = (ushort*)s; 
		ushort* ds = (ushort*)d; 
		ushort* dstend = (ushort*)(d + (pitch>>2)); 
 
		if(ss != NULL) 
		{ 
			for(; ds < dstend; sub+=4, ss++, ds++) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					*ds = (((((*ss&0xf800)*sub[3])>>8) + ((*((uint*)sub)>>8)&0xf800))&0xf800) 
						| (((((*ss&0x07e0)*sub[3])>>8) + ((*((uint*)sub)>>5)&0x07e0))&0x07e0) 
						| (((((*ss&0x001f)*sub[3])>>8) + ((*((uint*)sub)>>3)&0x001f))&0x001f); 
				} 
				else 
				{ 
					*ds = *ss; 
				} 
			} 
		} 
		else 
		{ 
			for(; ds < dstend; sub+=4, ds++) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					*ds = ((*((uint*)sub)>>8)&0xf800)|((*((uint*)sub)>>5)&0x07e0)|((*((uint*)sub)>>3)&0x001f); 
				} 
				else 
				{ 
					*ds = 0; 
				} 
			} 
		} 
	} 
	else if(subtype == MEDIASUBTYPE_RGB24) 
	{ 
		// 3 bytes per video pixel, 4 bytes per subpicture pixel
		uchar* st = (uchar*)s; 
		uchar* dt = (uchar*)d; 
		uchar* dstend = dt + pitch; 
 
		if(s != NULL) 
		{ 
			for(; dt < dstend; sub+=4, st+=3, dt+=3) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					dt[0] = ((st[0]*sub[3])>>8) + sub[0]; 
					dt[1] = ((st[1]*sub[3])>>8) + sub[1]; 
					dt[2] = ((st[2]*sub[3])>>8) + sub[2]; 
				} 
				else 
				{ 
					dt[0] = st[0]; 
					dt[1] = st[1]; 
					dt[2] = st[2]; 
				} 
			} 
		} 
		else 
		{ 
			for(; dt < dstend; sub+=4, dt+=3) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					dt[0] = sub[0]; 
					dt[1] = sub[1]; 
					dt[2] = sub[2]; 
				} 
				else 
				{ 
					dt[0] = dt[1] = dt[2] = 0; 
				} 
			} 
		} 
	} 
	else if(subtype == MEDIASUBTYPE_RGB32 || subtype == MEDIASUBTYPE_ARGB32) 
	{ 
		uint* dstend = d + (pitch>>2); 
 
		if(s != NULL) 
		{ 
			for(; d < dstend; sub+=4, s++, d++) 
			{ 
				if(sub[3] < 0xff) 
				{ 
					// bytes 0 and 2 are processed together in one multiply
					// (mask 0x00ff00ff), byte 1 separately; byte 3 of the result is 0
					*d = (((((*s&0x00ff00ff)*sub[3])>>8) + (*((uint*)sub)&0x00ff00ff))&0x00ff00ff) 
						| (((((*s&0x0000ff00)*sub[3])>>8) + (*((uint*)sub)&0x0000ff00))&0x0000ff00); 
				} 
				else 
				{ 
					*d = *s; 
				} 
			} 
		} 
		else 
		{ 
			for(; d < dstend; sub+=4, d++) 
			{ 
				*d = (sub[3] < 0xff)  
					? (*((uint*)sub)&0xffffff)  
					: 0; 
			} 
		} 
	} 
 
	// leave MMX state (only the YUY2 branch uses MMX, but this runs for every subtype)
	__asm emms; 
} 
 
// Mixes one subpicture line into one line of a single YV12 plane.
//
//  d        - destination line of the plane (accessed byte-wise)
//  s        - source line of the plane, or NULL; with NULL the untouched
//             samples get the plane's black value (0x10 luma, 0x80 chroma)
//  sub      - subpicture line, 4 bytes per pixel, byte 3 = alpha
//             (0xff leaves the source sample unchanged)
//  pitch    - number of samples to produce
//  plane    - 0: luma, one subpicture pixel per sample (value in byte 1)
//             1, 2: chroma, two subpicture pixels per sample (value in byte 0
//             of the first pixel for plane 2, of the second pixel for plane 1);
//             value and alpha are averaged with the pixel subPitch bytes away
//             any other value: nothing is written
//  subPitch - byte offset from sub to the second subpicture line used for the
//             chroma averaging (may be negative)
void MixLineYV12(uint* d, uint* s, uchar* sub, int pitch, int plane, int subPitch)
{
	BYTE* const dst = (BYTE*)d;
	BYTE* const src = (BYTE*)s;

	switch(plane)
	{
	case 0: // y
		for(int x = 0; x < pitch; x++, sub += 4)
		{
			if(sub[3] < 0xff)
			{
				if(src != NULL) dst[x] = (((src[x]-0x10)*sub[3])>>8) + sub[1];
				else dst[x] = sub[1];
			}
			else
			{
				if(src != NULL) dst[x] = src[x];
				else dst[x] = 0x10;
			}
		}
		break;

	case 1: // this plane's value lives in the second pixel of every pair
		sub += 4;
		// fall through
	case 2:
		for(int x = 0; x < pitch; x++, sub += 8)
		{
			// alpha averaged over the two lines covered by this chroma sample
			const int alpha = (sub[3]+sub[3+subPitch])>>1;

			if(alpha < 0xff)
			{
				const int chroma = (sub[0]+sub[subPitch])>>1;

				if(src != NULL) dst[x] = (((src[x]-0x80)*alpha)>>8) + chroma;
				else dst[x] = chroma;
			}
			else
			{
				if(src != NULL) dst[x] = src[x];
				else dst[x] = 0x80;
			}
		}
		break;

	default:
		break;
	}
}
 
// Copies the visible pixels of one 32-bit subpicture line onto an output line
// without any blending, converting to the output pixel format on the way.
//
//  d       - destination line
//  sub     - subpicture line, 4 bytes per pixel; a pixel is copied only when
//            its byte 3 is below 0xff
//  w       - number of pixels to process
//  subtype - output media subtype; unlisted subtypes write nothing
//
// For YV12 and YUY2 only luma is produced (YUY2 additionally stores 0x80 in
// the byte next to it), i.e. the text comes out without colour.
void BltLineRGB32(uint* d, uchar* sub, int w, const GUID& subtype)
{
	if(subtype == MEDIASUBTYPE_YV12)
	{
		BYTE* out = (BYTE*)d;

		for(int x = 0; x < w; x++, sub += 4)
		{
			if(sub[3] >= 0xff) continue;

			const int luma = (c2y_yb[sub[0]] + c2y_yg[sub[1]] + c2y_yr[sub[2]] + 0x108000) >> 16;
			out[x] = luma; // w/o colors
		}
	}
	else if(subtype == MEDIASUBTYPE_YUY2)
	{
		ushort* out = (ushort*)d;

		for(int x = 0; x < w; x++, sub += 4)
		{
			if(sub[3] >= 0xff) continue;

			const int luma = (c2y_yb[sub[0]] + c2y_yg[sub[1]] + c2y_yr[sub[2]] + 0x108000) >> 16;
			out[x] = 0x8000|luma; // w/o colors
		}
	}
	else if(subtype == MEDIASUBTYPE_RGB555)
	{
		ushort* out = (ushort*)d;

		for(int x = 0; x < w; x++, sub += 4)
		{
			if(sub[3] >= 0xff) continue;

			const uint px = *((uint*)sub);
			out[x] = ((px>>9)&0x7c00)|((px>>6)&0x03e0)|((px>>3)&0x001f);
		}
	}
	else if(subtype == MEDIASUBTYPE_RGB565)
	{
		ushort* out = (ushort*)d;

		for(int x = 0; x < w; x++, sub += 4)
		{
			if(sub[3] >= 0xff) continue;

			const uint px = *((uint*)sub);
			out[x] = ((px>>8)&0xf800)|((px>>5)&0x07e0)|((px>>3)&0x001f);
		}
	}
	else if(subtype == MEDIASUBTYPE_RGB24)
	{
		uchar* out = (uchar*)d;

		for(int x = 0; x < w; x++, sub += 4, out += 3)
		{
			if(sub[3] >= 0xff) continue;

			out[0] = sub[0];
			out[1] = sub[1];
			out[2] = sub[2];
		}
	}
	else if(subtype == MEDIASUBTYPE_RGB32 || subtype == MEDIASUBTYPE_ARGB32)
	{
		for(int x = 0; x < w; x++, sub += 4)
		{
			if(sub[3] >= 0xff) continue;

			d[x] = *((uint*)sub)&0xffffff;
		}
	}
}
 
// Fills in every second row of an image by vertical interpolation.
//
// Rows 0, 2, 4, ... must already contain data. Each odd row that has a row
// below it becomes the byte-wise average ((above + below) >> 1) of its two
// neighbours. When height is even the last row has no row below it and is
// copied from the row above.
//
//  ptr    - first byte of the image, rows are pitch bytes apart
//  height - number of rows; nothing is done for height <= 1
//  pitch  - row length in bytes
void AvgLines8(BYTE* ptr, int height, int pitch) 
{ 
	if(height <= 1) return; 
 
	BYTE* s = ptr; 
	BYTE* d = ptr + (height-2)*pitch; 
 
	for(; s < d; s += pitch*2) 
	{ 
		BYTE* tmp = s; 
 
		// The MMX loop handles pitch/8 groups of 8 bytes. The x86 LOOP instruction
		// decrements ecx before testing it, so entering with ecx == 0 would run
		// 2^32 iterations; skip the block when there is no complete group.
		if(pitch >= 8) 
		{ 
			__asm 
			{ 
				mov		esi, tmp 
				mov		ebx, pitch 
 
				mov		ecx, ebx 
				shr		ecx, 3 
 
				pxor	mm7, mm7 
AvgLines8_loop: 
				movq	mm0, [esi] 
				movq	mm1, mm0 
 
				punpcklbw	mm0, mm7 
				punpckhbw	mm1, mm7 
 
				movq	mm2, [esi+ebx*2] 
				movq	mm3, mm2 
 
				punpcklbw	mm2, mm7 
				punpckhbw	mm3, mm7 
 
				paddw	mm0, mm2 
				psrlw	mm0, 1 
 
				paddw	mm1, mm3 
				psrlw	mm1, 1 
 
				packuswb	mm0, mm1 
 
				movq	[esi+ebx], mm0 
 
				lea		esi, [esi+8] 
 
				loop	AvgLines8_loop 
 
				mov		tmp, esi 
			} 
		} 
 
		// remaining pitch%8 bytes of the row 
		for(int i = pitch&7; i--; tmp++) 
		{ 
			tmp[pitch] = (tmp[0] + tmp[pitch<<1]) >> 1; 
		} 
	} 
 
	if(!(height&1)) 
	{ 
		// even height: duplicate the last filled row into the final row 
		ptr += (height-2)*pitch; 
		memcpy(ptr + pitch, ptr, pitch); 
	} 
 
	__asm emms; 
} 
 
// Fills in every second row of an RGB555 image by vertical interpolation.
//
// Rows 0, 2, 4, ... must already contain data. Each odd row that has a row
// below it becomes the per-channel average of its two neighbours. When height
// is even the last row is copied from the row above.
//
//  ptr    - first byte of the image, rows are pitch bytes apart
//  height - number of rows; nothing is done for height <= 1
//  pitch  - row length in bytes (two bytes per pixel)
void AvgLines555(BYTE* ptr, int height, int pitch) 
{ 
	if(height <= 1) return; 
 
	unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0; 
	unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; 
 
	BYTE* s = ptr; 
	BYTE* d = ptr + (height-2)*pitch; 
 
	for(; s < d; s += pitch*2) 
	{ 
		BYTE* tmp = s; 
 
		// The MMX loop handles pitch/8 groups of 4 pixels. The x86 LOOP instruction
		// decrements ecx before testing it, so entering with ecx == 0 would run
		// 2^32 iterations; skip the block when there is no complete group.
		if(pitch >= 8) 
		{ 
			__asm 
			{ 
				mov		esi, tmp 
				mov		ebx, pitch 
 
				mov		ecx, ebx 
				shr		ecx, 3 
 
				movq	mm6, __0x03e003e003e003e0 
				movq	mm7, __0x001f001f001f001f 
 
AvgLines555_loop: 
				movq	mm0, [esi] 
				movq	mm1, mm0 
				movq	mm2, mm0 
 
				psrlw	mm0, 10				// red1 bits: mm0 = 001f001f001f001f 
				pand	mm1, mm6			// green1 bits: mm1 = 03e003e003e003e0 
				pand	mm2, mm7			// blue1 bits: mm2 = 001f001f001f001f 
 
				movq	mm3, [esi+ebx*2] 
				movq	mm4, mm3 
				movq	mm5, mm3 
 
				psrlw	mm3, 10				// red2 bits: mm3 = 001f001f001f001f 
				pand	mm4, mm6			// green2 bits: mm4 = 03e003e003e003e0 
				pand	mm5, mm7			// blue2 bits: mm5 = 001f001f001f001f 
 
				paddw	mm0, mm3 
				psrlw	mm0, 1				// (red1+red2)/2 
				psllw	mm0, 10				// red bits at 7c007c007c007c00 
 
				paddw	mm1, mm4 
				psrlw	mm1, 1				// (green1+green2)/2 
				pand	mm1, mm6			// green bits at 03e003e003e003e0 
 
				paddw	mm2, mm5 
				psrlw	mm2, 1				// (blue1+blue2)/2 
											// blue bits at 001f001f001f001f (no need to pand, lower bits were discarded) 
 
				por		mm0, mm1 
				por		mm0, mm2 
 
				movq	[esi+ebx], mm0 
 
				lea		esi, [esi+8] 
 
				loop	AvgLines555_loop 
 
				mov		tmp, esi 
			} 
		} 
 
		// Remaining (pitch%8)/2 pixels of the row. Pixels are 16 bits wide, so they
		// have to be read and written through a ushort pointer and the row distance
		// has to be expressed in pixels; working on single bytes would make the
		// 0x7c00 mask always yield 0 and store only half of each pixel.
		ushort* px = (ushort*)tmp; 
		int pitch16 = pitch>>1; 
 
		for(int i = (pitch&7)>>1; i--; px++) 
		{ 
			px[pitch16] =  
				((((px[0]&0x7c00) + (px[pitch16*2]&0x7c00)) >> 1)&0x7c00)| 
				((((px[0]&0x03e0) + (px[pitch16*2]&0x03e0)) >> 1)&0x03e0)| 
				((((px[0]&0x001f) + (px[pitch16*2]&0x001f)) >> 1)&0x001f); 
		} 
	} 
 
	if(!(height&1)) 
	{ 
		// even height: duplicate the last filled row into the final row 
		ptr += (height-2)*pitch; 
		memcpy(ptr + pitch, ptr, pitch); 
	} 
 
	__asm emms; 
} 
 
// Fills in every second row of an RGB565 image by vertical interpolation.
//
// Rows 0, 2, 4, ... must already contain data. Each odd row that has a row
// below it becomes the per-channel average of its two neighbours. When height
// is even the last row is copied from the row above.
//
//  ptr    - first byte of the image, rows are pitch bytes apart
//  height - number of rows; nothing is done for height <= 1
//  pitch  - row length in bytes (two bytes per pixel)
void AvgLines565(BYTE* ptr, int height, int pitch) 
{ 
	if(height <= 1) return; 
 
	unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0; 
	unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f; 
 
	BYTE* s = ptr; 
	BYTE* d = ptr + (height-2)*pitch; 
 
	for(; s < d; s += pitch*2) 
	{ 
		ushort* tmp = (ushort*)s; 
 
		// The MMX loop handles pitch/8 groups of 4 pixels. The x86 LOOP instruction
		// decrements ecx before testing it, so entering with ecx == 0 would run
		// 2^32 iterations; skip the block when there is no complete group.
		if(pitch >= 8) 
		{ 
			__asm 
			{ 
				mov		esi, tmp 
				mov		ebx, pitch 
 
				mov		ecx, ebx 
				shr		ecx, 3 
 
				movq	mm6, __0x07e007e007e007e0 
				movq	mm7, __0x001f001f001f001f 
 
AvgLines565_loop: 
				movq	mm0, [esi] 
				movq	mm1, mm0 
				movq	mm2, mm0 
 
				psrlw	mm0, 11				// red1 bits: mm0 = 001f001f001f001f 
				pand	mm1, mm6			// green1 bits: mm1 = 07e007e007e007e0 
				pand	mm2, mm7			// blue1 bits: mm2 = 001f001f001f001f 
 
				movq	mm3, [esi+ebx*2] 
				movq	mm4, mm3 
				movq	mm5, mm3 
 
				psrlw	mm3, 11				// red2 bits: mm3 = 001f001f001f001f 
				pand	mm4, mm6			// green2 bits: mm4 = 07e007e007e007e0 
				pand	mm5, mm7			// blue2 bits: mm5 = 001f001f001f001f 
 
				paddw	mm0, mm3 
				psrlw	mm0, 1				// (red1+red2)/2 
				psllw	mm0, 11				// red bits at f800f800f800f800 
 
				paddw	mm1, mm4 
				psrlw	mm1, 1				// (green1+green2)/2 
				pand	mm1, mm6			// green bits at 07e007e007e007e0 
 
				paddw	mm2, mm5 
				psrlw	mm2, 1				// (blue1+blue2)/2 
											// blue bits at 001f001f001f001f (no need to pand, lower bits were discarded) 
 
				por		mm0, mm1 
				por		mm0, mm2 
 
				movq	[esi+ebx], mm0 
 
				lea		esi, [esi+8] 
 
				loop	AvgLines565_loop 
 
				mov		tmp, esi 
			} 
		} 
 
		// Remaining (pitch%8)/2 pixels of the row. tmp points to 16-bit pixels, so
		// the row distance used as an index must be in pixels (pitch/2), not bytes;
		// indexing with pitch would write two rows down and read four rows down.
		int pitch16 = pitch>>1; 
 
		for(int i = (pitch&7)>>1; i--; tmp++) 
		{ 
			tmp[pitch16] =  
				((((tmp[0]&0xf800) + (tmp[pitch16*2]&0xf800)) >> 1)&0xf800)| 
				((((tmp[0]&0x07e0) + (tmp[pitch16*2]&0x07e0)) >> 1)&0x07e0)| 
				((((tmp[0]&0x001f) + (tmp[pitch16*2]&0x001f)) >> 1)&0x001f); 
		} 
	} 
 
	if(!(height&1)) 
	{ 
		// even height: duplicate the last filled row into the final row 
		ptr += (height-2)*pitch; 
		memcpy(ptr + pitch, ptr, pitch); 
	} 
 
	__asm emms; 
} 
 
/* ResX2 */ 
// Doubles the input picture in both directions.
//
//  d - destination buffer, receives a picture of twice the width and height
//      given by m_bihIn
//  s - source picture with the dimensions given by m_bihIn
//
// Every branch works in two steps:
//  1. each source row is stretched horizontally into every second destination
//     row: source pixels are copied and the pixel between two neighbours is
//     their average; the last pixel of a row is simply duplicated;
//  2. the AvgLines* helper of the format fills in the rows in between.
//
// Handled formats (input and output must match): YV12, YUY2, RGB555, RGB565,
// 24-bit and 32-bit RGB. Any other combination leaves d untouched. Row pitches
// are computed as width * bytes per pixel, without padding.
void CDirectVobSubFilter::Scale2x(BYTE* d, BYTE* s) 
{ 
	if(m_bihIn.biCompression == mmioFOURCC('Y', 'V', '1', '2') 
	&& m_bihOut.biCompression == mmioFOURCC('Y', 'V', '1', '2')) 
	{ 
		// the three planes are scaled one after the other; ww/hh hold the
		// dimensions of the current source plane 
		int ww = m_bihIn.biWidth; 
		int hh = m_bihIn.biHeight; 
		BYTE* ss = s; 
		BYTE* dd = d; 
 
		for(int plane = 0; plane < 3; plane++) 
		{ 
			int w = ww; 
			int h = hh; 
			int pitch = w; 
 
			BYTE* s1; 
			BYTE* s2; 
			BYTE* d1; 
 
			// d1 advances by one destination row inside the body and by another one
			// here, which leaves every second destination row for AvgLines8 
			for(s1 = ss, s2 = ss + (h*pitch), d1 = dd; s1 < s2; d1 += pitch*2) // TODO: replace this mess with mmx code 
			{ 
				BYTE* tmp = s1 + pitch; 
 
				for(BYTE* s3 = s1 + pitch - 1; s1 < s3; s1 += 1, d1 += 2) 
				{ 
					d1[0] = s1[0];  
					d1[1] = (s1[0]+s1[1])>>1; 
				} 
 
				// last sample of the row has no right neighbour 
				d1[0] = d1[1] = s1[0];  
 
				d1 += 2; 
 
				s1 = tmp; 
			} 
 
			w <<= 1; 
			h <<= 1; 
			pitch = w; 
 
			AvgLines8(dd, h, pitch); 
 
			if(plane == 0) 
			{ 
				// the two chroma planes have half the width and height 
				ww >>= 1; 
				hh >>= 1; 
 
				int size = m_bihIn.biWidth*m_bihIn.biHeight; 
				ss = s + size; 
				size <<= 2; 
				dd = d + size; 
			} 
			else if(plane == 1) 
			{ 
				int size = m_bihIn.biWidth*m_bihIn.biHeight; 
				ss = s + size + (size>>2); 
				size <<= 2; 
				dd = d + size + (size>>2); 
			} 
		} 
	} 
	else if(m_bihIn.biCompression == mmioFOURCC('Y', 'U', 'Y', '2') 
	&& m_bihOut.biCompression == mmioFOURCC('Y', 'U', 'Y', '2')) 
	{ 
		unsigned __int64 __0xffffffff00000000 = 0xffffffff00000000; 
		unsigned __int64 __0x00000000ffffffff = 0x00000000ffffffff; 
		unsigned __int64 __0x00ff00ff00ff00ff = 0x00ff00ff00ff00ff; 
 
		int w = m_bihIn.biWidth; 
		int h = m_bihIn.biHeight; 
		int pitch = w*2; 
 
		BYTE* s1; 
		BYTE* s2; 
		BYTE* d1; 
 
		for(s1 = s, s2 = s + (h*pitch), d1 = d; s1 < s2; d1 += pitch*2) 
		{ 
			BYTE* tmp = s1 + pitch; 
 
			// row0, 4 pixels: y1|u1|y2|v1|y3|u2|y4|v2 
			// -> 
			// row0, 8 pixels: y1|u1|(y1+y2)/2|v1|y2|(u1+u2)/2|(y2+y3)/2|(v1+v2)/2 
 
			// The MMX loop converts all but the last 4-byte group of the row
			// (ecx = pitch/4 - 1). The x86 LOOP instruction decrements ecx before
			// testing it, so entering with ecx == 0 would run 2^32 iterations;
			// skip the block when the row consists of a single group.
			if((pitch>>2) > 1) 
			{ 
				__asm 
				{ 
					mov		esi, s1 
					mov		edi, d1 
 
					mov		ecx, pitch 
					shr		ecx, 2 
					dec		ecx 
 
					movq	mm4, __0x00ff00ff00ff00ff 
					movq	mm5, __0x00000000ffffffff 
					movq	mm6, __0xffffffff00000000 
row_loop1: 
					movq	mm0, [esi] 
					movq	mm2, mm0 
 
					pand	mm0, mm4	// mm0 = 00y400y300y200y1 
					psrlw	mm2, 8		// mm2 = 00u200v200u100v1 
 
 
					movq	mm1, mm0 
 
					pand	mm0, mm5	// mm0 = 0000000000y200y1 
 
					psllq	mm1, 16 
					pand	mm1, mm6	// mm1 = 00y300y200000000 
 
					por		mm1, mm0	// mm1 = 00y300y200y200y1 
 
					punpcklwd mm0, mm0	// mm0 = 00y200y200y100y1 
 
					paddw	mm0, mm1 
					psrlw	mm0, 1		// mm0 = (mm0 + mm1) / 2 
 
 
					movq	mm1, mm2 
					punpckldq	mm1, mm1 // mm1 = 00u100v100u100v1 
 
					paddw	mm1, mm2 
					psrlw	mm1, 1		// mm1 = (mm1 + mm2) / 2 
 
 
					psllw	mm1, 8 
					por		mm0, mm1	// mm0 = (v1+v2)/2|(y2+y3)/2|(u1+u2)/2|y2|v1|(y1+y2)/2|u1|y1 
 
					movq	[edi], mm0 
 
					lea		esi, [esi+4] 
					lea		edi, [edi+8] 
 
					loop	row_loop1 
 
					mov		s1, esi 
					mov		d1, edi 
				}; 
			} 
 
			// last group of the row (y1|u|y2|v): no group to the right to blend with 
			*d1++ = s1[0]; 
			*d1++ = s1[1]; 
			*d1++ =(s1[0]+s1[2])>>1; 
			*d1++ = s1[3]; 
 
			*d1++ = s1[2]; 
			*d1++ = s1[1]; 
			*d1++ = s1[2]; 
			*d1++ = s1[3]; 
 
			s1 = tmp; 
		} 
 
		w <<= 1; 
		h <<= 1; 
		pitch = w*2; 
 
		AvgLines8(d, h, pitch); 
	} 
	else if(m_bihIn.biCompression <= 3 && m_bihOut.biCompression <= 3 
		&& m_bihIn.biBitCount == 16 && m_bihOut.biBitCount == 16  
		&& m_pOutput->CurrentMediaType().subtype == MEDIASUBTYPE_RGB555) 
	{ 
		int w = m_bihIn.biWidth; 
		int h = m_bihIn.biHeight; 
		int pitch = w*2; 
 
		BYTE* s1; 
		BYTE* s2; 
		BYTE* d1; 
 
		for(s1 = s, s2 = s + (h*pitch), d1 = d; s1 < s2; d1 += pitch*2) // TODO: replace this mess with mmx code 
		{ 
			BYTE* tmp = s1 + pitch; 
 
			for(BYTE* s3 = s1 + pitch - 2; s1 < s3; s1 += 2, d1 += 4) 
			{ 
				*((ushort*)d1) = *((ushort*)s1); 
				*((ushort*)d1+1) =  
					((((*((ushort*)s1)&0x7c00) + (*((ushort*)s1+1)&0x7c00)) >> 1)&0x7c00)| 
					((((*((ushort*)s1)&0x03e0) + (*((ushort*)s1+1)&0x03e0)) >> 1)&0x03e0)| 
					((((*((ushort*)s1)&0x001f) + (*((ushort*)s1+1)&0x001f)) >> 1)&0x001f); 
			} 
 
			// last pixel of the row has no right neighbour 
			*((ushort*)d1) = *((ushort*)s1); 
			*((ushort*)d1+1) = *((ushort*)s1); 
 
			d1 += 4; 
 
			s1 = tmp; 
		} 
 
		w <<= 1; 
		h <<= 1; 
		pitch = w*2; 
 
		AvgLines555(d, h, pitch); 
	} 
	else if(m_bihIn.biCompression <= 3 && m_bihOut.biCompression <= 3 
		&& m_bihIn.biBitCount == 16 && m_bihOut.biBitCount == 16 
		&& m_pOutput->CurrentMediaType().subtype == MEDIASUBTYPE_RGB565) 
	{ 
		int w = m_bihIn.biWidth; 
		int h = m_bihIn.biHeight; 
		int pitch = w*2; 
 
		BYTE* s1; 
		BYTE* s2; 
		BYTE* d1; 
 
		for(s1 = s, s2 = s + (h*pitch), d1 = d; s1 < s2; d1 += pitch*2) // TODO: replace this mess with mmx code 
		{ 
			BYTE* tmp = s1 + pitch; 
 
			for(BYTE* s3 = s1 + pitch - 2; s1 < s3; s1 += 2, d1 += 4) 
			{ 
				*((ushort*)d1) = *((ushort*)s1); 
				*((ushort*)d1+1) =  
					((((*((ushort*)s1)&0xf800) + (*((ushort*)s1+1)&0xf800)) >> 1)&0xf800)| 
					((((*((ushort*)s1)&0x07e0) + (*((ushort*)s1+1)&0x07e0)) >> 1)&0x07e0)| 
					((((*((ushort*)s1)&0x001f) + (*((ushort*)s1+1)&0x001f)) >> 1)&0x001f); 
			} 
 
			// last pixel of the row has no right neighbour 
			*((ushort*)d1) = *((ushort*)s1); 
			*((ushort*)d1+1) = *((ushort*)s1); 
 
			d1 += 4; 
 
			s1 = tmp; 
		} 
 
		w <<= 1; 
		h <<= 1; 
		pitch = w*2; 
 
		AvgLines565(d, h, pitch); 
	} 
	else if(m_bihIn.biCompression <= 3 && m_bihOut.biCompression <= 3 
		&& m_bihIn.biBitCount == 24 && m_bihOut.biBitCount == 24) 
	{ 
		int w = m_bihIn.biWidth; 
		int h = m_bihIn.biHeight; 
		int pitch = w*3; //(w*3+3)&~3; 
 
		BYTE* s1; 
		BYTE* s2; 
		BYTE* d1; 
 
		for(s1 = s, s2 = s + (h*pitch), d1 = d; s1 < s2; d1 += pitch*2) // TODO: replace this mess with mmx code 
		{ 
			BYTE* tmp = s1 + pitch; 
 
			for(BYTE* s3 = s1 + pitch - 3; s1 < s3; s1 += 3, d1 += 6) 
			{ 
				d1[0] = s1[0];  
				d1[1] = s1[1];  
				d1[2] = s1[2]; 
				d1[3] = (s1[0]+s1[3])>>1; 
				d1[4] = (s1[1]+s1[4])>>1; 
				d1[5] = (s1[2]+s1[5])>>1; 
			} 
 
			// last pixel of the row has no right neighbour 
			d1[0] = d1[3] = s1[0];  
			d1[1] = d1[4] = s1[1];  
			d1[2] = d1[5] = s1[2]; 
 
			d1 += 6; 
 
			s1 = tmp; 
		} 
 
		w <<= 1; 
		h <<= 1; 
		pitch = w*3; //(w*3+3)&~3; 
 
		AvgLines8(d, h, pitch); 
	} 
	else if(m_bihIn.biCompression <= 3 && m_bihOut.biCompression <= 3 
		&& m_bihIn.biBitCount == 32 && m_bihOut.biBitCount == 32) 
	{ 
		// A stretch through two DirectDraw system-memory surfaces and Blt was
		// tried here earlier; original author's remark: the image quality was
		// disappointing... 
 
		int w = m_bihIn.biWidth; 
		int h = m_bihIn.biHeight; 
		int pitch = w * 4; 
 
		BYTE* s1; 
		BYTE* s2; 
		BYTE* d1; 
 
		for(s1 = s, s2 = s + (h*pitch), d1 = d; s1 < s2; d1 += pitch*2) 
		{ 
			BYTE* tmp = s1 + pitch; 
 
			// The MMX loop converts all but the last pixel of the row
			// (ecx = pitch/4 - 1). The x86 LOOP instruction decrements ecx before
			// testing it, so entering with ecx == 0 would run 2^32 iterations;
			// skip the block when the row is a single pixel wide.
			if((pitch>>2) > 1) 
			{ 
				__asm 
				{ 
					mov		esi, s1 
					mov		edi, d1 
 
					mov		ecx, pitch 
					shr		ecx, 2 
					dec		ecx 
 
					pxor	mm0, mm0 
row_loop3: 
					movq	mm1, [esi] 
					movq	mm2, mm1 
 
					punpcklbw mm1, mm0	// mm1 = 00xx00r100g100b1 
					punpckhbw mm2, mm0	// mm2 = 00xx00r200g200b2 
 
					paddw	mm2, mm1 
					psrlw	mm2, 1		// mm2 = (mm1 + mm2) / 2 
 
					packuswb	mm1, mm2 
 
					movq	[edi], mm1 
 
					lea		esi, [esi+4] 
					lea		edi, [edi+8] 
 
					loop	row_loop3 
 
					mov		s1, esi 
					mov		d1, edi 
				}; 
			} 
 
			// last pixel of the row has no right neighbour 
			*((uint*)d1) = *((uint*)s1); 
			*((uint*)d1+1) = *((uint*)s1); 
 
			d1 += 8; 
 
			s1 = tmp; 
		} 
 
		w <<= 1; 
		h <<= 1; 
		pitch = w * 4; 
 
		AvgLines8(d, h, pitch); 
	} 
 
	// the AvgLines* helpers return before their own emms when height <= 1 
	__asm emms; 
} 
 
// Copies the input picture into the output buffer for the packed formats
// handled by MixLine, mixing in the subpicture on the way.
//
//  pOut - output buffer, written row by row (hOut rows of pitchOut bytes)
//  pIn  - input picture; replaced by the Scale2x result in m_pTempPicBuff
//         when m_fResX2Active is set
//  img  - subpicture to mix in, or NULL. It is only used when m_mode >= VOBSUB.
//
// The output is produced in three vertical bands: rows above the picture,
// the rows of the picture itself, and rows below it. Every output row is
// split into dpLeft bytes of black, dpMid bytes of content and dpRight bytes
// of black. Always returns NOERROR.
//
// NOTE(review): memsetd is declared elsewhere; from its use here it presumably
// fills a byte count with a 32-bit pattern - confirm.
HRESULT CDirectVobSubFilter::Copy(BYTE* pOut, BYTE* pIn, SubImage* img) 
{ 
	int wIn = m_bihIn.biWidth, wOut = m_bihOut.biWidth; 
	int hIn = m_bihIn.biHeight, hOut = abs(m_bihOut.biHeight); 
	int bppIn = m_bihIn.biBitCount, bppOut = m_bihOut.biBitCount; 
	int pitchIn = wIn * bppIn >> 3, pitchOut = wOut * bppOut >> 3; 
 
	bool fFlip = m_bihOut.biHeight < 0 && m_bihOut.biCompression <= 3; // flip if we are copying rgb and the signs aren't matching (we only check the output height since input is always > 0) 
	if(m_fFlipPicture) fFlip = !fFlip; 
	if(m_fMSMpeg4Fix) fFlip = !fFlip; 
//	if(m_fDivxPlusFix) fFlip = !fFlip; 
 
	bool fFlipSub = !(m_bihOut.biHeight > 0 && m_bihOut.biCompression <= 3); // flip unless the dst bitmap is also a flipped rgb 
	if(m_fFlipSubtitles) fFlipSub = !fFlipSub; 
//	if(m_fDivxPlusFix) fFlipSub = !fFlipSub; 
 
/* ResX2 */ 
	if(m_fResX2Active) 
	{ 
		// continue with the doubled picture instead of the original input
		Scale2x(m_pTempPicBuff, pIn); 
		pIn = m_pTempPicBuff; 
		wIn <<= 1; hIn <<= 1; pitchIn <<= 1; 
	} 
 
	// round both pitches up to a multiple of 4 bytes
	pitchIn = (pitchIn+3)&~3; 
	pitchOut = (pitchOut+3)&~3; 
 
	// fill pattern for the borders: YUY2 black (y = 0x10, u = v = 0x80) or 0
	uint black = (m_bihIn.biCompression == mmioFOURCC('Y', 'U', 'Y', '2')) ? 0x80108010 : 0; 
 
	const GUID& subtype = m_pOutput->CurrentMediaType().subtype; 
 
	// bm is only filled in - and only read below - when img is not NULL
	BITMAP bm; 
	if(img) GetObject(img->hbm, sizeof(BITMAP), &bm); 
 
	int y = 0; 
 
	{ 
		// horizontal split of an output row, in bytes
		int dpLeft = (((m_bihSub.biWidth - wIn) >> 1) & ~1) * bppIn >> 3; 
		int dpRight = max(pitchOut - (dpLeft + pitchIn), 0); 
		int dpMid = pitchOut - dpLeft - dpRight; 
 
		// i: current output row, j: end of the current band, k: end of the picture band clipped to hOut
		int i = 0, j = 0, k = 0; 
 
		j += (hOut - hIn) >> 1; 
 
		// band 1: rows above the picture (skipped when hIn >= hOut)
		for(; i < j; i++, pOut += pitchOut) // memsetd(pOut, black, pitchOut); 
		{ 
			memsetd(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB)  
			{ 
				y = fFlipSub ? (bm.bmHeight - i - 1) : i; 
				MixLine((uint*)(pOut + dpLeft), NULL, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, subtype); 
			} 
			else  
				memsetd(pOut + dpLeft, black, dpMid); 
 
			memsetd(pOut + dpLeft + dpMid, black, dpRight); 
		} 
 
		j += hIn; 
 
		// signed row step through the input: negative when the picture is flipped
		int pitchIn2; 
 
		if(!fFlip) 
		{ 
			// input taller than output: skip the rows cut off at the top
			if(hIn > hOut) pIn += pitchIn * ((hIn - hOut) >> 1); 
			pitchIn2 = pitchIn; 
		} 
		else 
		{ 
			// start at the last input row of this band and walk backwards
			pIn += pitchIn * (j-i-1); 
			pitchIn2 = -pitchIn; 
		} 
 
		// band 2: rows of the picture
		for(k = min(j, hOut); i < k; i++, pIn += pitchIn2, pOut += pitchOut) 
		{ 
			memsetd(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB)  
			{ 
				y = fFlipSub ? (bm.bmHeight - i - 1) : i; 
				MixLine((uint*)(pOut + dpLeft), (uint*)pIn, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, subtype); 
			} 
			else 
				memcpy(pOut + dpLeft, pIn, dpMid); 
 
			memsetd(pOut + dpLeft + dpMid, black, dpRight); 
		} 
 
		j = hOut; 
 
		// band 3: rows below the picture
		for(; i < j; i++, pOut += pitchOut) // memsetd(pOut, black, pitchOut); 
		{ 
			memsetd(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB) 
			{ 
				y = fFlipSub ? (bm.bmHeight - i - 1) : i; 
				MixLine((uint*)(pOut + dpLeft), NULL, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, subtype); 
			} 
			else  
				memsetd(pOut + dpLeft, black, dpMid); 
 
			memsetd(pOut + dpLeft + dpMid, black, dpRight); 
		} 
	} 
 
    return NOERROR; 
} 
 
// YV12 counterpart of Copy: copies the three planes of the input picture into
// the output buffer, mixing in the subpicture with MixLineYV12.
//
//  pOut - output buffer: luma plane followed by the two half-size chroma planes
//  pIn  - input picture in the same layout; replaced by the Scale2x result in
//         m_pTempPicBuff when m_fResX2Active is set
//  img  - subpicture to mix in, or NULL. It is only used when m_mode >= VOBSUB.
//
// Each plane is written in three vertical bands (above, picture, below), every
// row split into dpLeft / dpMid / dpRight bytes, with the borders set to the
// plane's black value (0x10 for luma, 0x80 for chroma). Always returns NOERROR.
HRESULT CDirectVobSubFilter::CopyYV12(BYTE* pOut, BYTE* pIn, SubImage* img) 
{ 
	int wIn = m_bihIn.biWidth, wOut = m_bihOut.biWidth; 
	int hIn = m_bihIn.biHeight, hOut = abs(m_bihOut.biHeight); 
	int pitchIn = wIn, pitchOut = wOut; 
 
	bool fFlip = m_bihOut.biHeight < 0 && m_bihOut.biCompression <= 3; // flip if we are copying rgb and the signs aren't matching (we only check the output height since input is always > 0) 
	if(m_fFlipPicture) fFlip = !fFlip; 
	if(m_fMSMpeg4Fix) fFlip = !fFlip; 
//	if(m_fDivxPlusFix) fFlip = !fFlip; 
 
	bool fFlipSub = !(m_bihOut.biHeight > 0 && m_bihOut.biCompression <= 3); // flip unless the dst bitmap is also a flipped rgb 
	if(m_fFlipSubtitles) fFlipSub = !fFlipSub; 
//	if(m_fDivxPlusFix) fFlipSub = !fFlipSub; 
 
/* ResX2 */ 
	if(m_fResX2Active) 
	{ 
		// continue with the doubled picture instead of the original input
		Scale2x(m_pTempPicBuff, pIn); 
		pIn = m_pTempPicBuff; 
		wIn <<= 1; hIn <<= 1; pitchIn <<= 1; 
	} 
 
	// round both luma pitches up to a multiple of 4 bytes
	pitchIn = (pitchIn+3)&~3; 
	pitchOut = (pitchOut+3)&~3; 
 
	// start of the first chroma plane, right behind the luma plane
	BYTE* pInVU = pIn + pitchIn*hIn; 
	BYTE* pOutVU = pOut + pitchOut*hOut; 
 
	uint black = 0x10; 
 
	// byte distance to the neighbouring subpicture line that MixLineYV12
	// averages with for chroma; its sign follows fFlipSub
	int subPitch = (fFlipSub ? -m_bihSub.biWidth : m_bihSub.biWidth)*4; 
 
	// NOTE(review): subtype is not referenced anywhere below in this function
	const GUID& subtype = m_pOutput->CurrentMediaType().subtype; 
 
	// bm is only filled in - and only read below - when img is not NULL
	BITMAP bm; 
	if(img) GetObject(img->hbm, sizeof(BITMAP), &bm); 
 
	int y = 0; 
 
	int dpLeft = ((m_bihSub.biWidth - wIn) >> 1) & ~1; 
 
	for(int plane = 0; plane < 3; plane++) 
	{ 
		// horizontal split of an output row of this plane, in bytes
		int dpRight = max(pitchOut - (dpLeft + pitchIn), 0); 
		int dpMid = pitchOut - dpLeft - dpRight; 
 
		// i: current output row, j: end of the current band, k: end of the picture band clipped to hOut
		int i = 0, j = 0, k = 0; 
 
		j += (hOut - hIn) >> 1; 
 
		// band 1: rows above the picture (skipped when hIn >= hOut)
		for(; i < j; i++, pOut += pitchOut) // memset(pOut, black, pitchOut); 
		{ 
			memset(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB)  
			{ 
				// a chroma row corresponds to two subpicture rows
				y = (plane == 0) ? i : (i<<1); 
				y = fFlipSub ? (bm.bmHeight - y - 1) : y; 
				MixLineYV12((uint*)(pOut + dpLeft), NULL, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, plane, subPitch); 
			} 
			else  
				memset(pOut + dpLeft, black, dpMid); 
 
			memset(pOut + dpLeft + dpMid, black, dpRight); 
		} 
 
		j += hIn; 
 
		// signed row step through the input: negative when the picture is flipped
		int pitchIn2; 
 
		if(!fFlip) 
		{ 
			// input taller than output: skip the rows cut off at the top
			if(hIn > hOut) pIn += pitchIn * ((hIn - hOut) >> 1); 
			pitchIn2 = pitchIn; 
		} 
		else 
		{ 
			// start at the last input row of this band and walk backwards
			pIn += pitchIn * (j-i-1); 
			pitchIn2 = -pitchIn; 
		} 
 
		// band 2: rows of the picture
		for(k = min(j, hOut); i < k; i++, pIn += pitchIn2, pOut += pitchOut) 
		{ 
			memset(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB)  
			{ 
				y = (plane == 0) ? i : (i<<1); 
				y = fFlipSub ? (bm.bmHeight - y - 1) : y; 
				MixLineYV12((uint*)(pOut + dpLeft), (uint*)pIn, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, plane, subPitch); 
			} 
			else 
				memcpy(pOut + dpLeft, pIn, dpMid); 
 
			memset(pOut + dpLeft + dpMid, black, dpRight); 
		} 
 
		j = hOut; 
 
		// band 3: rows below the picture
		for(; i < j; i++, pOut += pitchOut) // memsetd(pOut, black, pitchOut); 
		{ 
			memset(pOut, black, dpLeft); 
 
			if(img && m_mode >= VOBSUB) 
			{ 
				y = (plane == 0) ? i : (i<<1); 
				y = fFlipSub ? (bm.bmHeight - y - 1) : y; 
				MixLineYV12((uint*)(pOut + dpLeft), NULL, (uchar*)bm.bmBits + y * bm.bmWidthBytes, dpMid, plane, subPitch); 
			} 
			else  
				memset(pOut + dpLeft, black, dpMid); 
 
			memset(pOut + dpLeft + dpMid, black, dpRight); 
		} 
 
		if(plane == 0) 
		{ 
			// switch to chroma: half the size in both directions, black is 0x80
			hIn >>= 1; 
			hOut >>= 1; 
			pitchIn >>= 1; 
			pitchOut >>= 1; 
			dpLeft >>= 1; 
			pIn = pInVU; 
			pOut = pOutVU; 
			black = 0x80; 
		} 
		else if(plane == 1) 
		{ 
			// second chroma plane follows the first one
			pIn = pInVU + pitchIn*hIn; 
			pOut = pOutVU + pitchOut*hOut; 
		} 
	} 
 
    return NOERROR; 
} 
 
// Draws the on-screen debug text onto the output picture.
//
//  pOut - output picture (first row of the buffer described by m_bihOut)
//
// When m_fOSD is set the text consists of the output size and subtype, timing
// statistics and the list of queued subpictures; otherwise the text is empty
// and the function returns without touching pOut. The text is rendered with
// GDI into m_hbm (through m_hdc / m_hfont) and the covered rows are then
// copied to pOut with BltLineRGB32.
void CDirectVobSubFilter::PrintMessages(BYTE* pOut) 
{ 
	const GUID& subtype = m_pOutput->CurrentMediaType().subtype; 
 
	CString msg, tmp; 
 
	if(m_fOSD) 
	{ 
		CString c(GuidNames[subtype]); 
		if(!_tcsncmp(c, _T("MEDIASUBTYPE_"), 13)) c = c.Mid(13); 
		// a CString must be cast explicitly when passed through "..."; handing over
		// the object itself relies on its memory layout 
		tmp.Format(_T("%dx%d %s\n"), m_bihOut.biWidth, m_bihOut.biHeight, (LPCTSTR)c); 
		msg += tmp; 
 
		int start, stop, pos, size; 
		m_sic.GetStats(start, stop, pos, size); 
 
		tmp.Format(_T("real fps: %.3f, current fps: %.3f\nmedia time: %d, subtitle time: %d [ms]\nframe number: %d (calculated)\nrate: %.4f\nstart: %d - stop: %d [ms]\npos: %d - size: %d"),  
			m_fps, m_MediaFPS/*m_fMediaFPSEnabled?m_MediaFPS:fabs(m_fps)*/, 
			(int)m_tPrev.Millisecs(), CalcCurrentTime(), 
			(int)(m_tPrev.m_time * m_fps / 10000000), 
			m_pInput->CurrentRate(), 
			start, stop, pos, size); 
 
		msg += tmp; 
 
		if(size > 0) 
		{ 
			SubImage* img = NULL; 
			 
			// hold the queue's lock while walking it 
			CAutoLock cAutoLock(&m_sic.m_csAccessLock); 
 
			for(int i = 0; i < size && (img = m_sic.GetSubImage(i)); i++) 
			{ 
				tmp.Format(_T("\n%d: %d - %d [ms]"), i, img->start, img->stop); 
				msg += tmp; 
			} 
		} 
	} 
 
	if(msg.IsEmpty()) return; 
 
	HANDLE hOldBitmap = SelectObject(m_hdc, m_hbm); 
	HANDLE hOldFont = SelectObject(m_hdc, m_hfont); 
 
	SetTextColor(m_hdc, 0xffffff); 
	SetBkMode(m_hdc, TRANSPARENT); 
	SetMapMode(m_hdc, MM_TEXT); 
 
	BITMAP bm; 
	GetObject(m_hbm, sizeof(BITMAP), &bm); 
 
	// first pass only measures the text (DT_CALCRECT), second pass draws it
	// 10 pixels from the top-left corner, clipped to the bitmap 
	CRect r(0, 0, bm.bmWidth, bm.bmHeight); 
	DrawText(m_hdc, msg, _tcslen(msg), &r, DT_CALCRECT|DT_EXTERNALLEADING|DT_NOPREFIX|DT_WORDBREAK); 
 
	r += CPoint(10, 10); 
	r &= CRect(0, 0, bm.bmWidth, bm.bmHeight); 
 
	DrawText(m_hdc, msg, _tcslen(msg), &r, DT_LEFT|DT_TOP|DT_NOPREFIX|DT_WORDBREAK); 
 
	BYTE* pIn = (BYTE*)bm.bmBits; 
	int pitchIn = bm.bmWidthBytes; 
	int pitchOut = m_bihOut.biWidth * m_bihOut.biBitCount >> 3; 
 
	// YV12: only the luma plane is written, one byte per pixel 
	if(m_bihOut.biCompression == mmioFOURCC('Y', 'V', '1', '2')) 
		pitchOut = m_bihOut.biWidth; 
 
	pitchIn = (pitchIn+3)&~3; 
	pitchOut = (pitchOut+3)&~3; 
 
	if(m_bihOut.biHeight > 0 && m_bihOut.biCompression <= 3) // flip if the dst bitmap is flipped rgb (m_hbm is a top-down bitmap, not like the subpictures) 
	{ 
		pOut += pitchOut * (abs(m_bihOut.biHeight)-1); 
		pitchOut = -pitchOut; 
	} 
 
	pIn += pitchIn * r.top; 
	pOut += pitchOut * r.top; 
 
	for(int w = min(r.right, m_bihOut.biWidth), h = r.Height(); h--; pIn += pitchIn, pOut += pitchOut) 
	{ 
		BltLineRGB32((uint*)pOut, pIn, w, subtype); 
		// reset the copied row to 0xff000000 so that BltLineRGB32 (which skips
		// pixels whose byte 3 is 0xff) ignores it until text is drawn there again 
		memsetd(pIn, 0xff000000, r.right*4); 
	} 
 
	SelectObject(m_hdc, hOldBitmap); 
	SelectObject(m_hdc, hOldFont); 
} 