// Source: www.pudn.com > DirectDraw.rar > DirectDisplay.cpp, change:2007-09-20, size:65137b


// DirectDisplay.cpp: implementation of the CDirectDisplay class. 
// 
////////////////////////////////////////////////////////////////////// 
 
#include "stdafx.h" 
#include <stdio.h> 
#include "DirectDisplay.h" 
#include "YUV2Jpg.h" 
#include "debug.h" 
////////////////////////////////////////////////////////////////////// 
// Construction/Destruction 
////////////////////////////////////////////////////////////////////// 
#pragma comment(lib, "ddraw.lib") 
#pragma comment(lib, "dxguid.lib") 
#pragma comment(lib, "YUV2Jpg.lib") 
 
// Function-like min/max helpers.
// NOTE: each argument may be evaluated more than once, so never pass an
// expression with side effects.
#define MIN(X, Y) ((X)<(Y)?(X):(Y))
#define MAX(X, Y) ((X)>(Y)?(X):(Y))

// Pack R/G/B (each clamped to 0..255) into a 16-bit 5-5-5 pixel:
// red in bits 14..10, green in bits 9..5, blue in bits 4..0.
// The whole expansion is parenthesised so the macro can be used safely
// inside a larger expression (e.g. `MK_RGB555(r,g,b) & mask`).
#define MK_RGB555(R,G,B)	\
	(((MAX(0,MIN(255, R)) << 7) & 0x7c00) | \
	 ((MAX(0,MIN(255, G)) << 2) & 0x03e0) | \
	 ((MAX(0,MIN(255, B)) >> 3) & 0x001f))

// Pack R/G/B (each clamped to 0..255) into a 16-bit 5-6-5 pixel:
// red in bits 15..11, green in bits 10..5, blue in bits 4..0.
// Fully parenthesised for the same reason as MK_RGB555.
#define MK_RGB565(R,G,B)	\
	(((MAX(0,MIN(255, R)) << 8) & 0xf800) | \
	 ((MAX(0,MIN(255, G)) << 3) & 0x07e0) | \
	 ((MAX(0,MIN(255, B)) >> 3) & 0x001f))
 
// Storage for the static YUV->RGB lookup tables, one entry per 8-bit sample
// value. They are filled in by colorspace_init().
uint32_t CDirectDisplay::RGB_Y_tab[256];
uint32_t CDirectDisplay::B_U_tab[256];
uint32_t CDirectDisplay::G_U_tab[256];
uint32_t CDirectDisplay::G_V_tab[256];
uint32_t CDirectDisplay::R_V_tab[256];

// Colour-space conversion entry points used by DrawImageFromYUV().
// Each pointer is bound to the MMX implementation at static-initialisation
// time; the run-time CPU dispatch (SetFuncPointer) is currently commented out.
CSPTransFuncPtr CDirectDisplay::yv12to_rgb555 = CDirectDisplay::yv12to_rgb555_mmx;
CSPTransFuncPtr CDirectDisplay::yv12to_rgb565 = CDirectDisplay::yv12to_rgb565_mmx;
CSPTransFuncPtr CDirectDisplay::yv12to_rgb24bit = CDirectDisplay::yv12to_rgb24bit_mmx;
CSPTransFuncPtr CDirectDisplay::yv12to_rgb32bit = CDirectDisplay::yv12to_rgb32bit_mmx;
CSPTransFuncPtr CDirectDisplay::yv12to_yuv422 = CDirectDisplay::yv12to_yuv422_mmx;
 
/*
 * Construct an idle display object.
 *
 * No DirectDraw object or surface is created here; every interface pointer
 * starts out NULL and the output mode is VOUT_MODE_NONE. The only resource
 * acquired is the unnamed mutex that guards the "last frame" pointers.
 */
CDirectDisplay::CDirectDisplay()
{
	// DirectDraw interfaces (created later by CreateDDraw / InitDDraw).
	m_pDD = NULL;
	m_pDDSFront = NULL;
	m_pDDSBack = NULL;

	// Target window and frame geometry.
	m_hWnd = NULL;
	m_dwWidth = 0;
	m_dwHeight = 0;

	// Most recently submitted YUV planes and their pitch.
	m_pY = NULL;
	m_pU = NULL;
	m_pV = NULL;
	m_nSrcPitch = 0;	// was left uninitialised until the first DrawImageFromYUV() call
	m_vout_mode = VOUT_MODE_NONE;

	// Only valid when YUV acceleration is in use and an OSD draw callback
	// has been registered.
	m_pDDSOSD = NULL;

	m_pDrawCallback = NULL;
	m_pDrawcontext = NULL;

	// Prefer video memory; the surface-creation routines fall back to
	// system memory when video memory is exhausted.
	m_dwDDMemtype = DDSCAPS_VIDEOMEMORY;
	m_bDispRect	  = FALSE;
	ZeroMemory(&m_DispRect,sizeof(RECT));
//	SetFuncPointer();
	m_hCapMutex = CreateMutex(NULL,FALSE,NULL);
}
 
/*
 * Tear the object down: release every DirectDraw interface, then close the
 * mutex handle created by the constructor (if one was obtained).
 */
CDirectDisplay::~CDirectDisplay()
{
	ReleaseObjs();

	if(NULL == m_hCapMutex)
		return;

	CloseHandle(m_hCapMutex);
	m_hCapMutex = NULL;
}
 
/*
 * Release all DirectDraw interfaces held by this object.
 *
 * The surfaces (OSD, back, front) are released before the DirectDraw object
 * that created them. SAFE_RELEASE is defined outside this file section;
 * presumably it tolerates NULL and resets the pointer -- confirm against
 * its definition.
 */
void CDirectDisplay::ReleaseObjs()
{
	SAFE_RELEASE(m_pDDSOSD);
	SAFE_RELEASE(m_pDDSBack);
	SAFE_RELEASE(m_pDDSFront);
	SAFE_RELEASE(m_pDD);
}
 
/*
 * Enable or disable the default display rectangle.
 *
 * bDispRect - TRUE to enable, FALSE to disable.
 * pRect     - rectangle to remember; ignored (and the feature disabled)
 *             when NULL.
 *
 * When enabled, DrawImageFromYUV() uses the stored rectangle whenever its
 * caller does not supply one.
 */
void CDirectDisplay::SetDisplayRect(BOOL bDispRect,RECT *pRect)
{
	if(!bDispRect || NULL == pRect)
	{
		m_bDispRect = FALSE;
		return;
	}

	m_DispRect = *pRect;
	m_bDispRect = TRUE;
}
 
 
/*
 * Create the off-screen surface used for on-screen-display drawing.
 *
 * The surface is a plain off-screen surface with no explicit pixel format.
 * It is first requested in the currently preferred memory type; if that
 * fails with DDERR_OUTOFVIDEOMEMORY the preference is switched to system
 * memory (m_dwDDMemtype is updated) and the request is retried once.
 *
 * On success the output mode becomes VOUT_MODE_YUY2 and the frame size is
 * recorded.
 *
 * dwWidth, dwHeight - surface size in pixels.
 * Returns TRUE on success, FALSE if no DirectDraw object exists or the
 * surface could not be created.
 */
BOOL CDirectDisplay::CreateOSDSurface(DWORD dwWidth,DWORD dwHeight)
{
	// Same precondition as CreateYUVSurface(): nothing to create from.
	if(NULL == m_pDD)
		return FALSE;

	// Do not leak a surface left over from an earlier call.
	SAFE_RELEASE(m_pDDSOSD);

	DDSURFACEDESC2 ddsd;
	ZeroMemory(&ddsd, sizeof(DDSURFACEDESC2));
	ddsd.dwSize         = sizeof(DDSURFACEDESC2);
	ddsd.dwFlags        = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT;
	ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_dwDDMemtype;
	ddsd.dwWidth        = dwWidth;
	ddsd.dwHeight       = dwHeight;

	HRESULT hr = m_pDD->CreateSurface(&ddsd,&m_pDDSOSD,NULL);
	if(DDERR_OUTOFVIDEOMEMORY == hr)
	{
		// Video memory exhausted: fall back to system memory and retry.
		m_dwDDMemtype = DDSCAPS_SYSTEMMEMORY;
		ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_dwDDMemtype;
		hr = m_pDD->CreateSurface(&ddsd,&m_pDDSOSD,NULL);
	}
	if(FAILED(hr))
		return FALSE;

	m_vout_mode = VOUT_MODE_YUY2;
	m_dwWidth = dwWidth;
	m_dwHeight = dwHeight;
	return TRUE;
}
 
/*
 * Create the YUY2 back surface and, if that works, the OSD surface.
 *
 * The back surface is requested with the FOURCC 'YUY2' pixel format in the
 * currently preferred memory type. On DDERR_OUTOFVIDEOMEMORY the preference
 * switches to system memory and the request is retried once.
 *
 * dwWidth, dwHeight - surface size in pixels.
 * Returns the result of CreateOSDSurface() when the back surface was
 * created, FALSE otherwise (including when no DirectDraw object exists).
 */
BOOL CDirectDisplay::CreateYUVSurface(DWORD dwWidth,DWORD dwHeight)
{
	if(NULL == m_pDD)
		return FALSE;

	DDPIXELFORMAT ddpfYUY2 ={sizeof(DDPIXELFORMAT), DDPF_FOURCC,MAKEFOURCC('Y','U','Y','2'),0,0,0,0,0};

	DDSURFACEDESC2 desc;
	ZeroMemory(&desc,sizeof(DDSURFACEDESC2));
	desc.dwSize          = sizeof(DDSURFACEDESC2);
	desc.dwFlags         = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT;
	desc.ddsCaps.dwCaps  = DDSCAPS_OFFSCREENPLAIN | m_dwDDMemtype;
	desc.ddpfPixelFormat = ddpfYUY2;
	desc.dwWidth         = dwWidth;
	desc.dwHeight        = dwHeight;

	HRESULT hrCreate = m_pDD->CreateSurface(&desc,&m_pDDSBack,NULL);
	if(DDERR_OUTOFVIDEOMEMORY == hrCreate)
	{
		// Out of video memory: retry once in system memory.
		m_dwDDMemtype = DDSCAPS_SYSTEMMEMORY;
		desc.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_dwDDMemtype;
		hrCreate = m_pDD->CreateSurface(&desc,&m_pDDSBack,NULL);
	}
	if(FAILED(hrCreate))
		return FALSE;

	return CreateOSDSurface(dwWidth,dwHeight);
}
 
/*
 * Create an RGB back surface in the desktop pixel format and work out which
 * software colour conversion matches it.
 *
 * Any existing back surface is released first. The surface is requested in
 * the currently preferred memory type; on DDERR_OUTOFVIDEOMEMORY it is
 * retried once in system memory. The surface is then locked briefly, only to
 * read back its pixel format, from which m_vout_mode is derived
 * (RGB32 / RGB24 / RGB565 / RGB555).
 *
 * dwWidth, dwHeight - surface size in pixels.
 * Returns TRUE when the surface exists and its format is one of the
 * supported RGB layouts, FALSE otherwise.
 */
BOOL CDirectDisplay::CreateRGBSurface(DWORD dwWidth,DWORD dwHeight)
{
	if(NULL == m_pDD)
		return FALSE;

	SAFE_RELEASE(m_pDDSBack);

	// Forget any mode left over from a previous initialisation so that an
	// unsupported format cannot be reported as success below.
	m_vout_mode = VOUT_MODE_NONE;

	DDSURFACEDESC2 ddsd;
	ZeroMemory(&ddsd, sizeof(DDSURFACEDESC2));
	ddsd.dwSize         = sizeof(DDSURFACEDESC2);
	ddsd.dwFlags        = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT;
	ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_dwDDMemtype;
	ddsd.dwWidth        = dwWidth;
	ddsd.dwHeight       = dwHeight;

	HRESULT hr = m_pDD->CreateSurface( &ddsd, &m_pDDSBack, NULL );
	if(DDERR_OUTOFVIDEOMEMORY == hr)
	{
		// Retry in system memory. A successful retry must continue with
		// format detection instead of being reported as a failure.
		ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | DDSCAPS_SYSTEMMEMORY;
		hr = m_pDD->CreateSurface( &ddsd, &m_pDDSBack, NULL );
	}
	if(FAILED(hr))
		return FALSE;

	// Lock/unlock only to obtain the surface description (pixel format).
	ZeroMemory(&ddsd, sizeof(DDSURFACEDESC2));
	ddsd.dwSize     = sizeof(DDSURFACEDESC2);
	if(FAILED(m_pDDSBack->Lock(NULL, &ddsd,DDLOCK_SURFACEMEMORYPTR | DDLOCK_WAIT, NULL)))
	{
		return FALSE;
	}
	m_pDDSBack->Unlock(NULL);

	if (ddsd.ddpfPixelFormat.dwFlags & DDPF_RGB)
	{
		switch(ddsd.ddpfPixelFormat.dwRGBBitCount)
		{
		case 32:
			m_vout_mode = VOUT_MODE_RGB32;
			break;
		case 24:
			m_vout_mode = VOUT_MODE_RGB24;
			break;
		case 16:
			// The red mask tells 5-6-5 apart from 5-5-5.
			if(0xF800 == ddsd.ddpfPixelFormat.dwRBitMask)
				m_vout_mode = VOUT_MODE_RGB565;
			else if(0x7C00 == ddsd.ddpfPixelFormat.dwRBitMask)
				m_vout_mode = VOUT_MODE_RGB555;
			break;
		}
	}
	m_dwWidth = dwWidth;
	m_dwHeight = dwHeight;
	return (VOUT_MODE_NONE != m_vout_mode);
}
 
/*
 * Create the DirectDraw object and the primary surface, then attach a
 * clipper for the given window.
 *
 * hWnd - window the video will be displayed in.
 * Returns FALSE as soon as any step fails; otherwise returns the result of
 * SetClipperWnd(hWnd).
 */
BOOL CDirectDisplay::CreateDDraw(HWND hWnd)
{
	HRESULT hrStep = DirectDrawCreateEx(NULL,(LPVOID *)&m_pDD,IID_IDirectDraw7,NULL);
	if(DD_OK != hrStep || NULL == m_pDD)
		return FALSE;

	// Windowed (non-exclusive) operation.
	hrStep = m_pDD->SetCooperativeLevel(0,DDSCL_NORMAL);
	if(DD_OK != hrStep)
		return FALSE;

	DDSURFACEDESC2 primaryDesc;
	ZeroMemory(&primaryDesc,sizeof(DDSURFACEDESC2));
	primaryDesc.dwSize         = sizeof(DDSURFACEDESC2);
	primaryDesc.dwFlags        = DDSD_CAPS;
	primaryDesc.ddsCaps.dwCaps = DDSCAPS_PRIMARYSURFACE;

	hrStep = m_pDD->CreateSurface(&primaryDesc,&m_pDDSFront,NULL);
	if(DD_OK != hrStep)
		return FALSE;

	m_hWnd = hWnd;
	return SetClipperWnd(hWnd);
}
 
/*
 * (Re)create the off-screen surfaces for a frame of the given size.
 *
 * The existing OSD and back surfaces are released, then a YUY2 surface pair
 * is attempted; if that fails an RGB back surface is created instead.
 *
 * Returns TRUE when either path succeeded.
 */
BOOL CDirectDisplay::InitDDraw(DWORD dwWidth,DWORD dwHeight)
{
	SAFE_RELEASE(m_pDDSOSD);
	SAFE_RELEASE(m_pDDSBack);

	if(CreateYUVSurface(dwWidth,dwHeight))
		return TRUE;

	// YUY2 path unavailable: fall back to software conversion into RGB.
	return CreateRGBSurface(dwWidth,dwHeight);
}
 
/*
 * Attach a clipper bound to hWnd to the primary surface.
 *
 * Returns FALSE immediately (leaving m_hWnd untouched) when the DirectDraw
 * object or the primary surface does not exist. Otherwise m_hWnd is updated
 * to hWnd regardless of the outcome, and the return value is TRUE only if
 * the clipper was created, bound to the window and set on the surface.
 */
BOOL CDirectDisplay::SetClipperWnd(HWND hWnd)
{
	if(NULL == m_pDD || NULL == m_pDDSFront)
		return FALSE;

	BOOL bAttached = FALSE;
	LPDIRECTDRAWCLIPPER pClipper = NULL;
	if(DD_OK == m_pDD->CreateClipper(0 , &pClipper , NULL))
	{
		const bool bBound = (DD_OK == pClipper->SetHWnd(0,hWnd))
			&& (DD_OK == m_pDDSFront->SetClipper(pClipper));
		bAttached = bBound ? TRUE : FALSE;

		// Drop our local reference to the clipper.
		pClipper->Release();
	}

	m_hWnd = hWnd;
	return bAttached;
}
 
/*
 * Register (or, with NULL, remove) the overlay draw callback.
 *
 * pDrawcallback - invoked from DrawImageFromBack() with a device context,
 *                 the frame size and pDrawcontext.
 * pDrawcontext  - opaque value handed back to the callback unchanged.
 */
void CDirectDisplay::SetDrawcallback(DrawCallback pDrawcallback,LPVOID pDrawcontext)
{
	m_pDrawcontext  = pDrawcontext;
	m_pDrawCallback = pDrawcallback;
}
 
/*
 * Re-present the current back-surface content without converting a new
 * frame. Simply forwards pRect to DrawImageFromBack().
 */
void CDirectDisplay::UpdateImage(RECT *pRect)
{
	DrawImageFromBack(pRect);
}
 
/*
 * Present the back surface in the client area of m_hWnd.
 *
 * destRec - despite its name this is the SOURCE rectangle on the back
 *           surface that should fill the window; NULL selects the whole
 *           frame (0,0,m_dwWidth,m_dwHeight).
 *
 * Steps:
 *  1. If a draw callback is registered it is given a device context to paint
 *     on. With an OSD surface the selected part of the back surface is first
 *     stretched onto the whole OSD surface and the callback paints there;
 *     without one the callback paints directly on the back surface.
 *  2. The window's client rectangle is converted to screen coordinates and
 *     clipped against the screen size reported by GetSystemMetrics(); the
 *     source rectangle is shrunk by the same proportion.
 *  3. The result is blitted to the primary surface. Lost surfaces are
 *     restored so that a later call can succeed.
 *
 * Nothing is drawn when the surfaces are missing, the client area is empty,
 * or the window lies completely outside the screen bounds.
 */
void CDirectDisplay::DrawImageFromBack(RECT *destRec)
{
	// UpdateImage() can reach this function before any surface exists.
	if(NULL == m_pDDSFront || NULL == m_pDDSBack)
		return;

	RECT rcFull;
	SetRect(&rcFull,0,0,m_dwWidth,m_dwHeight);

	// Source rectangle on whichever surface ends up being presented.
	// (The original copied only sizeof(RECT*) bytes of *destRec here.)
	RECT rcSrc = (NULL != destRec) ? *destRec : rcFull;

	LPDIRECTDRAWSURFACE7 pDDSSrc = m_pDDSBack;
	if(m_pDrawCallback)
	{
		HDC hDC = NULL;
		SIZE size;
		size.cx = m_dwWidth;
		size.cy = m_dwHeight;
		if(m_pDDSOSD)
		{
			// Stretch the selected region onto the whole OSD surface, then
			// let the callback draw on top of it.
			m_pDDSOSD->Blt(&rcFull,m_pDDSBack,destRec,DDBLT_WAIT,NULL);
			if((DD_OK == m_pDDSOSD->GetDC(&hDC)) && (NULL != hDC))
			{
				m_pDrawCallback(hDC,&size,m_pDrawcontext);
				m_pDDSOSD->ReleaseDC(hDC);
				pDDSSrc = m_pDDSOSD;
				// The OSD surface already holds the cropped image, so it
				// must be presented in full rather than cropped again.
				rcSrc = rcFull;
			}
		}
		else
		{
			if((DD_OK == m_pDDSBack->GetDC(&hDC)) && (NULL != hDC))
			{
				m_pDrawCallback(hDC,&size,m_pDrawcontext);
				m_pDDSBack->ReleaseDC(hDC);
			}
		}
	}

	// Destination: the client area in screen coordinates.
	RECT rcWnd;
	if(!GetClientRect(m_hWnd,&rcWnd))
		return;
	ClientToScreen(m_hWnd,(LPPOINT)&rcWnd.left);
	ClientToScreen(m_hWnd,(LPPOINT)&rcWnd.right);

	const int nScreenWidth  = GetSystemMetrics(SM_CXSCREEN);
	const int nScreenHeight = GetSystemMetrics(SM_CYSCREEN);
	const int nWndLeft   = rcWnd.left;
	const int nWndTop    = rcWnd.top;
	const int nWndWidth  = rcWnd.right - rcWnd.left;
	const int nWndHeight = rcWnd.bottom - rcWnd.top;

	// An empty client area would make the scale factors divide by zero.
	if(nWndWidth <= 0 || nWndHeight <= 0)
		return;
	// Completely outside the screen bounds: nothing visible to draw.
	if(rcWnd.right <= 0 || rcWnd.bottom <= 0 ||
	   rcWnd.left >= nScreenWidth || rcWnd.top >= nScreenHeight)
		return;

	// Source pixels per destination pixel, computed once from the UNCLIPPED
	// window so that clipping on opposite edges uses the same scale.
	const double rateX = (double)(rcSrc.right - rcSrc.left) / nWndWidth;
	const double rateY = (double)(rcSrc.bottom - rcSrc.top) / nWndHeight;

	// The display window extends beyond the screen boundary: trim the
	// destination and shrink the source by the matching amount.
	RECT DrawRect = rcSrc;
	if(nWndLeft < 0)
	{
		DrawRect.left = rcSrc.left + (int)(-nWndLeft * rateX);
		rcWnd.left = 0;
	}
	if(rcWnd.right > nScreenWidth)
	{
		DrawRect.right = rcSrc.left + (int)((nScreenWidth - nWndLeft) * rateX);
		rcWnd.right  = nScreenWidth;
	}
	if(nWndTop < 0)
	{
		DrawRect.top = rcSrc.top + (int)(-nWndTop * rateY);
		rcWnd.top = 0;
	}
	if(rcWnd.bottom > nScreenHeight)
	{
		DrawRect.bottom = rcSrc.top + (int)((nScreenHeight - nWndTop) * rateY);
		rcWnd.bottom = nScreenHeight;
	}

	HRESULT hr = m_pDDSFront->Blt(&rcWnd,pDDSSrc,&DrawRect,DDBLT_WAIT,NULL);
	if(DDERR_SURFACELOST == hr)
	{
		// Restore everything we own so the next frame can be shown.
		m_pDDSFront->Restore();
		m_pDDSBack->Restore();
		if(m_pDDSOSD)
			m_pDDSOSD->Restore();
	}
}
 
/*
 * Convert one planar YUV frame into the back surface and present it.
 *
 * in_Y, in_U, in_V - plane pointers; all three must be non-NULL.
 * nPitch           - source pitch passed on to the conversion routine.
 * destRec          - rectangle forwarded to DrawImageFromBack(); when NULL
 *                    and a display rectangle has been set with
 *                    SetDisplayRect(), that rectangle is used instead.
 *
 * Returns TRUE once the frame has been converted and handed to
 * DrawImageFromBack(); FALSE if an input is NULL, the surfaces or output
 * mode are not set up, or the back surface cannot be locked.
 */
BOOL CDirectDisplay::DrawImageFromYUV(PBYTE in_Y, PBYTE in_U, PBYTE in_V, int nPitch,RECT *destRec)
{
	if(!in_Y || !in_U || !in_V)
		return FALSE;

	// Remember the most recent frame under the capture mutex.
	// NOTE(review): presumably read by a snapshot routine elsewhere in the
	// file -- confirm.
	WaitForSingleObject(m_hCapMutex,INFINITE);
	m_pY = in_Y;
	m_pU = in_U;
	m_pV = in_V;
	m_nSrcPitch = nPitch;
	ReleaseMutex(m_hCapMutex);

	const bool bReady = (NULL != m_pDDSFront)
		&& (NULL != m_pDDSBack)
		&& (VOUT_MODE_NONE != m_vout_mode);
	if(!bReady)
		return FALSE;

	if(IsBadReadPtr(m_pDDSBack,sizeof(IDirectDrawSurface7)))
		return FALSE;

	DDSURFACEDESC2 lockDesc;
	ZeroMemory(&lockDesc,sizeof(DDSURFACEDESC2));
	lockDesc.dwSize = sizeof(DDSURFACEDESC2);
	if(DD_OK != m_pDDSBack->Lock(NULL , &lockDesc,DDLOCK_SURFACEMEMORYPTR | DDLOCK_WAIT , NULL))
	{
		// Try to bring the surface back for the next frame.
		m_pDDSBack->Restore();
		return FALSE;
	}

	// Destination memory of the locked surface.
	PBYTE pDst = (PBYTE)lockDesc.lpSurface;
	const LONG lDstPitch = lockDesc.lPitch;

	// Pick the converter that matches the surface's pixel layout.
	switch(m_vout_mode)
	{
	case VOUT_MODE_RGB555:
		yv12to_rgb555(in_Y, in_U, in_V, m_dwWidth, m_dwHeight, nPitch, pDst, lDstPitch, FALSE);
		break;
	case VOUT_MODE_RGB565:
		yv12to_rgb565(in_Y, in_U, in_V, m_dwWidth, m_dwHeight, nPitch, pDst, lDstPitch, FALSE);
		break;
	case VOUT_MODE_RGB24:
		yv12to_rgb24bit(in_Y, in_U, in_V, m_dwWidth, m_dwHeight, nPitch, pDst, lDstPitch, FALSE);
		break;
	case VOUT_MODE_RGB32:
		yv12to_rgb32bit(in_Y, in_U, in_V, m_dwWidth, m_dwHeight, nPitch, pDst, lDstPitch, FALSE);
		break;
	case VOUT_MODE_YUY2:
		yv12to_yuv422(in_Y, in_U, in_V, m_dwWidth, m_dwHeight, nPitch, pDst, lDstPitch, FALSE);
		break;
	}
	m_pDDSBack->Unlock(NULL);

	RECT *pShowRect = destRec;
	if(NULL == pShowRect && m_bDispRect)
		pShowRect = &m_DispRect;

	DrawImageFromBack(pShowRect);
	return TRUE;
}
 
void CDirectDisplay::colorspace_init(void) 
{ 
	static BOOL bInit = FALSE; 
	if(bInit) 
		return ; 
	 
#define RGB_Y_OUT		1.164 
#define B_U_OUT			2.018 
#define Y_ADD_OUT		16 
	 
#define G_U_OUT			0.391 
#define G_V_OUT			0.813 
#define U_ADD_OUT		128 
	 
#define R_V_OUT			1.596 
#define V_ADD_OUT		128 
	 
#define SCALEBITS_OUT	13 
#define FIX_OUT(x)		((uint16_t) ((x) * (1L<<SCALEBITS_OUT) + 0.5)) 
 
	int32_t i; 
	for (i = 0; i < 256; i++) 
	{ 
		RGB_Y_tab[i] = FIX_OUT(RGB_Y_OUT) * (i - Y_ADD_OUT); 
		B_U_tab[i] = FIX_OUT(B_U_OUT) * (i - U_ADD_OUT); 
		G_U_tab[i] = FIX_OUT(G_U_OUT) * (i - U_ADD_OUT); 
		G_V_tab[i] = FIX_OUT(G_V_OUT) * (i - V_ADD_OUT); 
		R_V_tab[i] = FIX_OUT(R_V_OUT) * (i - V_ADD_OUT); 
	} 
	bInit = TRUE; 
} 
 
/*void CDirectDisplay::SetFuncPointer() 
{ 
	unsigned int cpu_flags = detect_cpu_flags(); 
	if(cpu_flags & XVID_CPU_MMX) 
	{ 
		yv12to_rgb555 = yv12to_rgb555_mmx; 
		yv12to_rgb565 = yv12to_rgb565_mmx; 
		yv12to_rgb24bit = yv12to_rgb24bit_mmx; 
		yv12to_rgb32bit = yv12to_rgb32bit_mmx; 
		yv12to_yuv422 = yv12to_yuv422_mmx; 
	} 
	if(cpu_flags & XVID_CPU_SSE2) 
	{ 
		yv12to_rgb555 = yv12to_rgb555_sse2; 
		yv12to_rgb565 = yv12to_rgb565_sse2; 
		yv12to_rgb24bit = yv12to_rgb24bit_sse2; 
		yv12to_rgb32bit = yv12to_rgb32bit_sse2; 
		yv12to_yuv422 = yv12to_yuv422_sse2; 
	} 
} 
 
void CDirectDisplay::yv12to_rgb555_c(PBYTE in_Y,PBYTE in_U,PBYTE in_V,DWORD dwWidth,DWORD dwHeight,int nSrcPitch,PBYTE pOut,int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_V) 
		return ; 
	colorspace_init(); 
	int x_dif = nDstPitch - dwWidth *  2; 
	int y_dif = nSrcPitch - dwWidth; 
	int uv_dif = (nSrcPitch - dwWidth) / 2; 
	DWORD x,y; 
	if(bFlip) 
	{ 
		pOut += (dwHeight - 1) * nDstPitch; 
		x_dif = -2 * dwWidth - nDstPitch; 
		nDstPitch = -1 * nDstPitch;	 
	} 
	for(y = 0 ; y < dwHeight ; y+= 2) 
	{ 
		int r[2],g[2],b[2]; 
		r[0] = r[1] = g[0] = g[1] = b[0] = b[1] = 0; 
		for(x = 0; x < dwWidth ; x += 2) 
		{ 
			int rgb_y; 
			int b_u0 = B_U_tab[in_U[0]]; 
			int g_uv0 = G_U_tab[in_U[0]]+G_V_tab[in_V[0]]; 
			int r_v0 = R_V_tab[in_V[0]]; 
			rgb_y = RGB_Y_tab[ in_Y[0] ];						 
			b[0] = (b[0] & 0x7) + ((rgb_y + b_u0) >> 13);	 
			g[0] = (g[0] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[0] = (r[0] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut) = MK_RGB555(r[0], g[0], b[0]);	 
			rgb_y = RGB_Y_tab[in_Y[1]];				 
			b[0] = (b[0] & 0x7) + ((rgb_y + b_u0) >> 13);		 
			g[0] = (g[0] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[0] = (r[0] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+2) = MK_RGB555(r[0], g[0], b[0]); 
 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch ] ];						 
			b[1] = (b[1] & 0x7) + ((rgb_y + b_u0) >> 13);	 
			g[1] = (g[1] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[1] = (r[1] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+nDstPitch) = MK_RGB555(r[1], g[1], b[1]);	 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch + 1] ];				\ 
			b[1] = (b[1] & 0x7) + ((rgb_y + b_u0) >> 13);		 
			g[1] = (g[1] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[1] = (r[1] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+nDstPitch+2) = MK_RGB555(r[1], g[1], b[1]); 
 
			pOut += 4; 
			in_Y += 2; 
			in_U += 1; 
			in_V += 1; 
		} 
		pOut += x_dif + nDstPitch; 
		in_Y += y_dif + nSrcPitch; 
		in_U += uv_dif; 
		in_V += uv_dif; 
	} 
} 
 
void CDirectDisplay::yv12to_rgb565_c(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_V) 
		return ; 
	colorspace_init(); 
	int x_dif = nDstPitch - dwWidth *  2; 
	int y_dif = nSrcPitch - dwWidth; 
	int uv_dif = (nSrcPitch - dwWidth) / 2; 
	DWORD x,y; 
	if(bFlip) 
	{ 
		pOut += (dwHeight - 1) * nDstPitch; 
		x_dif = -2 * dwWidth - nDstPitch; 
		nDstPitch = -1 * nDstPitch;	 
	} 
	for(y = 0 ; y < dwHeight ; y+= 2) 
	{ 
		int r[2],g[2],b[2]; 
		r[0] = r[1] = g[0] = g[1] = b[0] = b[1] = 0; 
		for(x = 0; x < dwWidth ; x += 2) 
		{ 
			int rgb_y; 
			int b_u0 = B_U_tab[in_U[0]]; 
			int g_uv0 = G_U_tab[in_U[0]]+G_V_tab[in_V[0]]; 
			int r_v0 = R_V_tab[in_V[0]]; 
			rgb_y = RGB_Y_tab[ in_Y[0] ];						 
			b[0] = (b[0] & 0x7) + ((rgb_y + b_u0) >> 13);	 
			g[0] = (g[0] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[0] = (r[0] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut) = MK_RGB565(r[0], g[0], b[0]);	 
			rgb_y = RGB_Y_tab[in_Y[1]];				 
			b[0] = (b[0] & 0x7) + ((rgb_y + b_u0) >> 13);		 
			g[0] = (g[0] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[0] = (r[0] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+2) = MK_RGB565(r[0], g[0], b[0]); 
 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch ] ];						 
			b[1] = (b[1] & 0x7) + ((rgb_y + b_u0) >> 13);	 
			g[1] = (g[1] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[1] = (r[1] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+nDstPitch) = MK_RGB565(r[1], g[1], b[1]);	 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch + 1] ];				 
			b[1] = (b[1] & 0x7) + ((rgb_y + b_u0) >> 13);		 
			g[1] = (g[1] & 0x7) + ((rgb_y - g_uv0) >> 13);	 
			r[1] = (r[1] & 0x7) + ((rgb_y + r_v0) >> 13);		 
			*(uint16_t *) (pOut+nDstPitch+2) = MK_RGB565(r[1], g[1], b[1]); 
 
			pOut += 4; 
			in_Y += 2; 
			in_U += 1; 
			in_V += 1; 
		} 
		pOut += x_dif + nDstPitch; 
		in_Y += y_dif + nSrcPitch; 
		in_U += uv_dif; 
		in_V += uv_dif; 
	} 
} 
 
void CDirectDisplay::yv12to_rgb24bit_c(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_V) 
		return ; 
	colorspace_init(); 
	int x_dif = nDstPitch - 3*dwWidth;		 
	int y_dif = nSrcPitch - dwWidth;				 
	int uv_dif = (nSrcPitch - dwWidth) / 2;		 
	DWORD x, y;										 
	if (bFlip) 
	{								 
		pOut += (dwHeight - 1) * nDstPitch;			 
		x_dif = -3 *dwWidth - nDstPitch;		 
		nDstPitch = -1 * nDstPitch;						 
	}												 
	for (y = 0; y < dwHeight; y+= 2) 
	{			 
		for (x = 0; x < dwWidth; x+= 2) 
		{	 
			int rgb_y;												 
			int b_u0 = B_U_tab[ in_U[0] ];							 
			int g_uv0 = G_U_tab[ in_U[0] ] + G_V_tab[ in_V[0] ];	 
			int r_v0 = R_V_tab[ in_V[0] ];							 
			rgb_y = RGB_Y_tab[ in_Y[0] ];						 
			pOut[0] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT));	 
			pOut[1] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT));	 
			pOut[2] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT));	 
			rgb_y = RGB_Y_tab[ in_Y[1] ]; 
			pOut[3] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT));	 
			pOut[4] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT));	 
			pOut[5] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT)); 
			 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch] ];						 
			pOut[nDstPitch] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT));	 
			pOut[nDstPitch + 1] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT));	 
			pOut[nDstPitch + 2] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT));	 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch + 1] ];									 
			pOut[nDstPitch + 3] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT));	 
			pOut[nDstPitch + 4] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT));	 
			pOut[nDstPitch + 5] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT));	 
 
			pOut += 6;				 
			in_Y += 2;						 
			in_U += 1;					 
			in_V += 1;					 
		}											 
		pOut += x_dif + nDstPitch;		 
		in_Y += y_dif + nSrcPitch;		 
		in_U += uv_dif;	 
		in_V += uv_dif;	 
	}												 
} 
 
void CDirectDisplay::yv12to_rgb32bit_c(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_V) 
		return ; 
	colorspace_init(); 
	int x_dif = nDstPitch - 4 * dwWidth; 
	int y_dif = nSrcPitch - dwWidth; 
	int uv_dif = (nSrcPitch - dwWidth)  / 2; 
	DWORD x, y; 
	if (bFlip) 
	{								 
		pOut += (dwHeight - 1) * nDstPitch; 
		x_dif = - 4 * dwWidth - nDstPitch; 
		nDstPitch = -1 * nDstPitch; 
	} 
	for (y = 0; y < dwHeight; y+= 2) 
	{ 
		for (x = 0; x < dwWidth; x+= 2) 
		{ 
			int rgb_y; 
			int b_u0 = B_U_tab[ in_U[0] ]; 
			int g_uv0 = G_U_tab[ in_U[0] ] + G_V_tab[ in_V[0] ]; 
			int r_v0 = R_V_tab[ in_V[0] ]; 
 
			rgb_y = RGB_Y_tab[ in_Y[0] ]; 
			pOut[0] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT)); 
			pOut[1] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT)); 
			pOut[2] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT)); 
			pOut[3] = 0; 
	 
			rgb_y = RGB_Y_tab[ in_Y[1] ]; 
			pOut[4] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT)); 
			pOut[5] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT)); 
			pOut[6] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT)); 
			pOut[7] = 0; 
 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch] ]; 
			pOut[nDstPitch] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+1] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+2] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+3] = 0; 
			rgb_y = RGB_Y_tab[ in_Y[nSrcPitch + 1] ]; 
			pOut[nDstPitch+4] = MAX(0, MIN(255, (rgb_y + b_u0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+5] = MAX(0, MIN(255, (rgb_y - g_uv0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+6] = MAX(0, MIN(255, (rgb_y + r_v0) >> SCALEBITS_OUT)); 
			pOut[nDstPitch+7] = 0; 
 
			pOut += 8; 
			in_Y += 2; 
			in_U += 1; 
			in_V += 1; 
		} 
		pOut += x_dif + nDstPitch; 
		in_Y += y_dif + nSrcPitch; 
		in_U += uv_dif; 
		in_V += uv_dif; 
	} 
} 
 
void CDirectDisplay::yv12to_yuv422_c(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_V) 
		return ; 
	int x_dif = nDstPitch - 2 * dwWidth; 
	int y_dif = nSrcPitch - dwWidth; 
	int uv_dif = (nSrcPitch - dwWidth) / 2; 
	DWORD x, y; 
	if (bFlip) 
	{ 
		pOut += (dwHeight - 1) * nDstPitch; 
		x_dif = - 2*dwWidth - nDstPitch; 
		nDstPitch = -1 * nDstPitch; 
	} 
	for (y = 0; y < dwHeight; y+= 2) 
	{ 
		for (x = 0; x < dwWidth; x+= 2) 
		{ 
			pOut[0] = in_Y[0]; 
			pOut[1] = in_U[0]; 
			pOut[2] = in_Y[1]; 
			pOut[3] = in_V[0]; 
 
			pOut[nDstPitch] = in_Y[nSrcPitch]; 
			pOut[nDstPitch + 1] = in_U[0]; 
			pOut[nDstPitch + 2] = in_Y[nSrcPitch +1]; 
			pOut[nDstPitch + 3] = in_V[0]; 
			pOut += 4; 
			in_Y += 2; 
			in_U += 1; 
			in_V += 1; 
		} 
		pOut += x_dif + nDstPitch; 
		in_Y += y_dif + nSrcPitch; 
		in_U += uv_dif; 
		in_V += uv_dif; 
	} 
}*/ 
 
 
/*
 * Convert a planar YV12 frame to packed 16-bit RGB555 using MMX.
 *
 * in_Y, in_U, in_V - source planes. The chroma planes are stepped with a
 *                    pitch of nSrcPitch/2 and one chroma row is shared by
 *                    two luma rows.
 * dwWidth/dwHeight - frame size in pixels. Each inner iteration converts 8
 *                    pixels, so only the first (dwWidth & ~7) pixels of a
 *                    row are written. Rows are handled in pairs.
 * nSrcPitch        - luma pitch in bytes.
 * pOut, nDstPitch  - destination buffer and its pitch in bytes.
 * bFlip            - when TRUE the source is read bottom-up (the plane
 *                    pointers are moved to the last row and the pitch is
 *                    negated), which flips the image vertically.
 *
 * Frames narrower than 8 pixels or shorter than 2 rows are rejected, and a
 * trailing odd row is left untouched: the assembly always converts two rows
 * and at least one 8-pixel group, so it would otherwise run past the ends of
 * the buffers.
 *
 * The 64-bit constants (mmw_*, mmb_0x10, mask_*) are defined outside this
 * function.
 */
void CDirectDisplay::yv12to_rgb555_mmx(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip)
{
	if(NULL == in_Y || NULL == in_U || NULL == in_V || NULL == pOut)
		return ;
	// With fewer than 8 pixels the loop counter below would start at 0 and
	// "inc edi / jne" would only terminate after wrapping through 2^32.
	if(dwWidth < 8 || dwHeight < 2)
		return ;
	DWORD i;
	// Negative count of 8-pixel groups; the asm loops count it up to zero.
	int horiz_circle = -1 * (dwWidth >> 3);
	if(bFlip)
	{
		in_Y = in_Y + (nSrcPitch * (dwHeight -1));
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		nSrcPitch = -1 * nSrcPitch;
	}
	// Only complete row pairs are converted (i+1 must be a valid row).
	for (i=0; i+1<dwHeight; i+=2)
	{
		__asm
		{
			push eax
			push ebx
			push ecx
			push edx
			push edi

			mov eax, pOut
			mov ebx, in_Y
			mov ecx, in_U
			mov edx, in_V
			mov edi, horiz_circle

horiz_loop1:

			// ---- first row of the pair ----
			// load data
			movd mm2, [ecx]					 //; mm2 = ________u3u2u1u0
			movd mm3, [edx]					 //; mm3 = ________v3v2v1v0
			movq mm0, [ebx]          //; mm0 = y7y6y5y4y3y2y1y0

			pxor mm7, mm7						 //; zero mm7

			// convert chroma part
			punpcklbw mm2, mm7       //; mm2 = __u3__u2__u1__u0
			punpcklbw mm3, mm7       //; mm3 = __v3__v2__v1__v0
			psubw mm2, mmw_0x0080    //; mm2 -= 128
			psubw mm3, mmw_0x0080    //; mm3 -= 128
			psllw mm2, 3             //; mm2 *= 8
			psllw mm3, 3             //; mm3 *= 8
			movq mm4, mm2            //; mm4 = mm2 = u
			movq mm5, mm3            //; mm5 = mm3 = v
			pmulhw mm2, mmw_mult_U_G //; mm2 *= u green coeff
			pmulhw mm3, mmw_mult_V_G //; mm3 *= v green coeff
			pmulhw mm4, mmw_mult_U_B //; mm4 = blue chroma
			pmulhw mm5, mmw_mult_V_R //; mm5 = red chroma
			paddsw mm2, mm3					 //; mm2 = green chroma

			// convert luma part
			psubusb mm0, mmb_0x10    //; mm0 -= 16
			movq mm1, mmw_0x00ff     //; mm1 = 00ff00ff00ff00ff
			pand mm1, mm0            //; mm1 = __y6__y4__y2__y0 luma even
			psrlw mm0, 8             //; mm0 = __y7__y5__y3__y1 luma odd
			psllw mm0, 3             //; mm0 *= 8
			psllw mm1, 3             //; mm1 *= 8
			pmulhw mm0, mmw_mult_Y   //; mm0 luma odd *= luma coeff
			pmulhw mm1, mmw_mult_Y   //; mm1 luma even *= luma coeff

			// complete the matrix calc with the additions
			movq mm3, mm4						 //; copy blue chroma
			movq mm6, mm5						 //; copy red chroma
			movq mm7, mm2						 //; copy green chroma
			paddsw mm3, mm0					 //; mm3 = luma odd + blue chroma
			paddsw mm4, mm1					 //; mm4 = luma even + blue chroma
			paddsw mm6, mm0					 //; mm6 = luma odd + red chroma
			paddsw mm5, mm1					 //; mm5 = luma even + red chroma
			paddsw mm7, mm0					 //; mm7 = luma odd + green chroma
			paddsw mm2, mm1					 //; mm2 = luma even + green chroma
			// clipping (saturate to 0..255)
			packuswb mm3, mm3
			packuswb mm4, mm4
			packuswb mm6, mm6
			packuswb mm5, mm5
			packuswb mm7, mm7
			packuswb mm2, mm2
			// interleave odd and even parts
			punpcklbw mm4, mm3			 //; mm4 = b7b6b5b4b3b2b1b0 blue
			punpcklbw mm5, mm6			 //; mm5 = r7r6r5r4r3r2r1r0 red
			punpcklbw mm2, mm7			 //; mm2 = g7g6g5g4g3g2g1g0 green

			// mask not needed bits (using 555)
			pand mm4, mask_0xf8
			pand mm5, mask_0xf8
			pand mm2, mask_0xf8

			// mix colors and write

			psrlw mm4, 3						 //; mm4 = blue shifted
			pand mm4, mask_0x1f			 //; mask the blue again
			pxor mm7, mm7						 //; zero mm7
			movq mm1, mm4						 //; mm1 = copy blue
			movq mm3, mm5						 //; mm3 = copy red
			movq mm6, mm2						 //; mm6 = copy green

			// pixels 4..7 -> second 8 output bytes
			punpckhbw mm1, mm7
			punpckhbw mm3, mm7
			punpckhbw mm6, mm7
			psllw mm6, 2						 //; shift green
			psllw mm3, 7						 //; shift red
			por mm6, mm3
			por mm6, mm1
			movq 8[eax], mm6

			// pixels 0..3 -> first 8 output bytes
			punpcklbw mm2, mm7			 //; mm2 = __g3__g2__g1__g0 already masked
			punpcklbw mm5, mm7
			punpcklbw mm4, mm7
			psllw mm2, 2						 //; shift green
			psllw mm5, 7						 //; shift red
			por mm2, mm5
			por mm2, mm4
			movq [eax], mm2

			add ebx, 8               //; in_Y   += 8
			add ecx, 4               //; in_U   += 4
			add edx, 4               //; in_V   += 4
			add eax, 16              //; pOut += 16 (wrote 16 bytes)

			inc edi
			jne horiz_loop1

			// ---- advance to the second row of the pair ----
			// pOut and in_Y move down one row; the chroma pointers are
			// reloaded so the same chroma row is used again.
			mov eax, pOut
			add eax, nDstPitch
			mov pOut,eax
			mov ebx, in_Y
			add ebx,nSrcPitch
			mov in_Y,ebx
			mov ecx, in_U
			mov edx, in_V
			mov edi, horiz_circle

horiz_loop2:

			// ---- second row of the pair (same arithmetic as above) ----
			// load data
			movd mm2, [ecx]					 //; mm2 = ________u3u2u1u0
			movd mm3, [edx]					 //; mm3 = ________v3v2v1v0
			movq mm0, [ebx]          //; mm0 = y7y6y5y4y3y2y1y0

			pxor mm7, mm7						 //; zero mm7

			// convert chroma part
			punpcklbw mm2, mm7       //; mm2 = __u3__u2__u1__u0
			punpcklbw mm3, mm7       //; mm3 = __v3__v2__v1__v0
			psubw mm2, mmw_0x0080    //; mm2 -= 128
			psubw mm3, mmw_0x0080    //; mm3 -= 128
			psllw mm2, 3             //; mm2 *= 8
			psllw mm3, 3             //; mm3 *= 8
			movq mm4, mm2            //; mm4 = mm2 = u
			movq mm5, mm3            //; mm5 = mm3 = v
			pmulhw mm2, mmw_mult_U_G //; mm2 *= u green coeff
			pmulhw mm3, mmw_mult_V_G //; mm3 *= v green coeff
			pmulhw mm4, mmw_mult_U_B //; mm4 = blue chroma
			pmulhw mm5, mmw_mult_V_R //; mm5 = red chroma
			paddsw mm2, mm3					 //; mm2 = green chroma

			// convert luma part
			psubusb mm0, mmb_0x10    //; mm0 -= 16
			movq mm1, mmw_0x00ff     //; mm1 = 00ff00ff00ff00ff
			pand mm1, mm0            //; mm1 = __y6__y4__y2__y0 luma even
			psrlw mm0, 8             //; mm0 = __y7__y5__y3__y1 luma odd
			psllw mm0, 3             //; mm0 *= 8
			psllw mm1, 3             //; mm1 *= 8
			pmulhw mm0, mmw_mult_Y   //; mm0 luma odd *= luma coeff
			pmulhw mm1, mmw_mult_Y   //; mm1 luma even *= luma coeff

			// complete the matrix calc with the additions
			movq mm3, mm4						 //; copy blue chroma
			movq mm6, mm5						 //; copy red chroma
			movq mm7, mm2						 //; copy green chroma
			paddsw mm3, mm0					 //; mm3 = luma odd + blue chroma
			paddsw mm4, mm1					 //; mm4 = luma even + blue chroma
			paddsw mm6, mm0					 //; mm6 = luma odd + red chroma
			paddsw mm5, mm1					 //; mm5 = luma even + red chroma
			paddsw mm7, mm0					 //; mm7 = luma odd + green chroma
			paddsw mm2, mm1					 //; mm2 = luma even + green chroma
			// clipping (saturate to 0..255)
			packuswb mm3, mm3
			packuswb mm4, mm4
			packuswb mm6, mm6
			packuswb mm5, mm5
			packuswb mm7, mm7
			packuswb mm2, mm2
			// interleave odd and even parts
			punpcklbw mm4, mm3			 //; mm4 = b7b6b5b4b3b2b1b0 blue
			punpcklbw mm5, mm6			 //; mm5 = r7r6r5r4r3r2r1r0 red
			punpcklbw mm2, mm7			 //; mm2 = g7g6g5g4g3g2g1g0 green

			// mask not needed bits (using 555)
			pand mm4, mask_0xf8
			pand mm5, mask_0xf8
			pand mm2, mask_0xf8

			// mix colors and write

			psrlw mm4, 3						 //; mm4 = blue shifted
			pand mm4, mask_0x1f			 //; mask the blue again
			pxor mm7, mm7						 //; zero mm7
			movq mm1, mm4						 //; mm1 = copy blue
			movq mm3, mm5						 //; mm3 = copy red
			movq mm6, mm2						 //; mm6 = copy green

			// pixels 4..7 -> second 8 output bytes
			punpckhbw mm1, mm7
			punpckhbw mm3, mm7
			punpckhbw mm6, mm7
			psllw mm6, 2						 //; shift green
			psllw mm3, 7						 //; shift red
			por mm6, mm3
			por mm6, mm1
			movq 8[eax], mm6

			// pixels 0..3 -> first 8 output bytes
			punpcklbw mm2, mm7			 //; mm2 = __g3__g2__g1__g0 already masked
			punpcklbw mm5, mm7
			punpcklbw mm4, mm7
			psllw mm2, 2						 //; shift green
			psllw mm5, 7						 //; shift red
			por mm2, mm5
			por mm2, mm4
			movq [eax], mm2

			add ebx, 8               //; in_Y   += 8
			add ecx, 4               //; in_U   += 4
			add edx, 4               //; in_V   += 4
			add eax, 16              //; pOut += 16 (wrote 16 bytes)

			inc edi
			jne horiz_loop2

			pop edi
			pop edx
			pop ecx
			pop ebx
			pop eax

			// leave MMX state so x87 floating point can be used again
			emms

		}
		// The asm already advanced pOut/in_Y by one row; step past the
		// second row and move the chroma planes down one row.
		pOut += nDstPitch;
		in_Y   += nSrcPitch;
		in_U   += nSrcPitch / 2;
		in_V   += nSrcPitch / 2;
	}
}
 
// Converts a planar YV12 frame to packed 16-bit RGB565 using MMX.
//
//   in_Y, in_U, in_V : source luma / chroma planes; the call is a no-op if any is NULL.
//   dwWidth          : width in pixels. Pixels are converted in groups of 8
//                      (dwWidth >> 3 groups per row); a remainder of dwWidth & 7
//                      pixels is not converted.
//   dwHeight         : height in rows. Rows are converted in pairs that share one
//                      chroma row; a trailing odd row is not converted.
//   nSrcPitch        : byte pitch of the luma plane; nSrcPitch / 2 is used as the
//                      chroma pitch.
//   pOut, nDstPitch  : destination buffer and its byte pitch (16 bytes are written
//                      per 8-pixel group).
//   bFlip            : when set, the source is read bottom-up (pointers start on the
//                      last row and the source pitch is negated).
//
// The mmw_*/mmb_*/mask_* operands are 64-bit constants defined outside this block.
void CDirectDisplay::yv12to_rgb565_mmx(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{
	if(NULL == in_Y || NULL == in_U || NULL == in_V)
		return ;

	// Negative number of 8-pixel groups per row; the assembly counts it up to zero.
	int horiz_circle = -1 *(dwWidth >> 3);

	// With fewer than 8 pixels the counter would start at 0 and "inc edi / jne" would
	// only terminate after wrapping through 2^32 iterations, running off every buffer.
	// With fewer than 2 rows there is no complete row pair to convert.
	if(0 == horiz_circle || dwHeight < 2)
		return ;

	if(bFlip)
	{
		// Start on the last luma / chroma row and walk the source backwards.
		in_Y = in_Y + (nSrcPitch * (dwHeight -1));
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		nSrcPitch = -1 * nSrcPitch;
	}

	// One iteration converts two luma rows. "i+1 < dwHeight" stops before a trailing
	// odd row, whose partner row would lie outside both the source and the destination.
	for (DWORD i=0; i+1<dwHeight; i+=2) 
	{
		__asm 
		{
			push eax
			push ebx
			push ecx
			push edx
			push edi

			mov eax, pOut                    // eax = destination cursor
			mov ebx, in_Y                    // ebx = luma cursor
			mov ecx, in_U                    // ecx = U cursor
			mov edx, in_V                    // edx = V cursor
			mov edi, horiz_circle            // edi = -(groups per row)

horiz_loop1:                                 // ---- first row of the pair ----

			// load data
			movd mm2, [ecx]                  // mm2 = ________u3u2u1u0
			movd mm3, [edx]                  // mm3 = ________v3v2v1v0
			movq mm0, [ebx]                  // mm0 = y7y6y5y4y3y2y1y0

			pxor mm7, mm7                    // mm7 = 0

			// chroma: widen to words, remove the 128 bias, scale by 8, apply coefficients
			punpcklbw mm2, mm7               // mm2 = __u3__u2__u1__u0
			punpcklbw mm3, mm7               // mm3 = __v3__v2__v1__v0
			psubw mm2, mmw_0x0080            // u -= 128
			psubw mm3, mmw_0x0080            // v -= 128
			psllw mm2, 3                     // u *= 8
			psllw mm3, 3                     // v *= 8
			movq mm4, mm2                    // mm4 = u
			movq mm5, mm3                    // mm5 = v
			pmulhw mm2, mmw_mult_U_G         // u * green coefficient
			pmulhw mm3, mmw_mult_V_G         // v * green coefficient
			pmulhw mm4, mmw_mult_U_B         // mm4 = blue chroma
			pmulhw mm5, mmw_mult_V_R         // mm5 = red chroma
			paddsw mm2, mm3                  // mm2 = green chroma

			// luma: remove the 16 offset (saturating), split even/odd pixels, scale
			psubusb mm0, mmb_0x10            // y -= 16, clamped at 0

			movq mm1, mmw_0x00ff             // mm1 = 00ff00ff00ff00ff
			pand mm1, mm0                    // mm1 = __y6__y4__y2__y0 (even pixels)
			psrlw mm0, 8                     // mm0 = __y7__y5__y3__y1 (odd pixels)

			psllw mm0, 3                     // odd luma *= 8
			psllw mm1, 3                     // even luma *= 8
			pmulhw mm0, mmw_mult_Y           // odd luma * luma coefficient
			pmulhw mm1, mmw_mult_Y           // even luma * luma coefficient

			// add luma to each chroma term; each chroma word serves one even and one odd pixel
			movq mm3, mm4                    // copy blue chroma
			movq mm6, mm5                    // copy red chroma
			movq mm7, mm2                    // copy green chroma
			paddsw mm3, mm0                  // mm3 = blue, odd pixels
			paddsw mm4, mm1                  // mm4 = blue, even pixels
			paddsw mm6, mm0                  // mm6 = red, odd pixels
			paddsw mm5, mm1                  // mm5 = red, even pixels
			paddsw mm7, mm0                  // mm7 = green, odd pixels
			paddsw mm2, mm1                  // mm2 = green, even pixels
			// clip to 0..255
			packuswb mm3, mm3
			packuswb mm4, mm4
			packuswb mm6, mm6
			packuswb mm5, mm5
			packuswb mm7, mm7
			packuswb mm2, mm2
			// interleave even and odd pixels
			punpcklbw mm4, mm3               // mm4 = b7b6b5b4b3b2b1b0
			punpcklbw mm5, mm6               // mm5 = r7r6r5r4r3r2r1r0
			punpcklbw mm2, mm7               // mm2 = g7g6g5g4g3g2g1g0

			// keep the top 5 bits of every channel. Green is masked with 0xf8 as well,
			// so bit 5 of each output word (the lowest green bit of RGB565) is always 0.
			pand mm4, mask_0xf8
			pand mm5, mask_0xf8
			pand mm2, mask_0xf8

			// pack: red -> bits 15..11, green -> bits 10..6, blue -> bits 4..0
			psrlw mm4, 3                     // blue >>= 3 (word shift leaks across bytes)
			pand mm4, mask_0x1f              // drop the bits leaked by the word shift
			pxor mm7, mm7                    // mm7 = 0
			movq mm1, mm5                    // mm1 = copy of red
			movq mm3, mm4                    // mm3 = copy of blue
			movq mm6, mm2                    // mm6 = copy of green

			punpckhbw mm1, mm7               // red,   pixels 4..7 as words
			punpckhbw mm3, mm7               // blue,  pixels 4..7 as words
			punpckhbw mm6, mm7               // green, pixels 4..7 as words
			psllw mm6, 3                     // position green
			psllw mm1, 8                     // position red
			por mm6, mm3
			por mm6, mm1
			movq 8[eax], mm6                 // store pixels 4..7

			punpcklbw mm2, mm7               // green, pixels 0..3 as words
			punpcklbw mm4, mm7               // blue,  pixels 0..3 as words
			punpcklbw mm5, mm7               // red,   pixels 0..3 as words
			psllw mm2, 3                     // position green
			psllw mm5, 8                     // position red
			por mm2, mm4
			por mm2, mm5
			movq [eax], mm2                  // store pixels 0..3

			add ebx, 8                       // luma cursor += 8
			add ecx, 4                       // U cursor += 4
			add edx, 4                       // V cursor += 4
			add eax, 16                      // destination cursor += 16 bytes

			inc edi
			jne horiz_loop1

			// step to the second row of the pair; the same chroma row is re-read
			mov eax, pOut
			add eax, nDstPitch
			mov pOut,eax
			mov ebx, in_Y
			add ebx,nSrcPitch
			mov in_Y,ebx

			mov ecx, in_U
			mov edx, in_V
			mov edi, horiz_circle

horiz_loop2:                                 // ---- second row: same steps as horiz_loop1 ----

			// load data
			movd mm2, [ecx]                  // mm2 = ________u3u2u1u0
			movd mm3, [edx]                  // mm3 = ________v3v2v1v0
			movq mm0, [ebx]                  // mm0 = y7y6y5y4y3y2y1y0

			pxor mm7, mm7                    // mm7 = 0

			// chroma
			punpcklbw mm2, mm7
			punpcklbw mm3, mm7
			psubw mm2, mmw_0x0080
			psubw mm3, mmw_0x0080
			psllw mm2, 3
			psllw mm3, 3
			movq mm4, mm2
			movq mm5, mm3
			pmulhw mm2, mmw_mult_U_G
			pmulhw mm3, mmw_mult_V_G
			pmulhw mm4, mmw_mult_U_B         // mm4 = blue chroma
			pmulhw mm5, mmw_mult_V_R         // mm5 = red chroma
			paddsw mm2, mm3                  // mm2 = green chroma

			// luma
			psubusb mm0, mmb_0x10

			movq mm1, mmw_0x00ff
			pand mm1, mm0                    // even pixels
			psrlw mm0, 8                     // odd pixels

			psllw mm0, 3
			psllw mm1, 3
			pmulhw mm0, mmw_mult_Y
			pmulhw mm1, mmw_mult_Y

			// luma + chroma
			movq mm3, mm4
			movq mm6, mm5
			movq mm7, mm2
			paddsw mm3, mm0
			paddsw mm4, mm1
			paddsw mm6, mm0
			paddsw mm5, mm1
			paddsw mm7, mm0
			paddsw mm2, mm1
			// clip
			packuswb mm3, mm3
			packuswb mm4, mm4
			packuswb mm6, mm6
			packuswb mm5, mm5
			packuswb mm7, mm7
			packuswb mm2, mm2
			// interleave even and odd pixels
			punpcklbw mm4, mm3               // blue
			punpcklbw mm5, mm6               // red
			punpcklbw mm2, mm7               // green

			// keep the top 5 bits of every channel
			pand mm4, mask_0xf8
			pand mm5, mask_0xf8
			pand mm2, mask_0xf8

			// pack and store
			psrlw mm4, 3                     // blue >>= 3
			pand mm4, mask_0x1f
			pxor mm7, mm7
			movq mm1, mm5                    // copy of red
			movq mm3, mm4                    // copy of blue
			movq mm6, mm2                    // copy of green

			punpckhbw mm1, mm7
			punpckhbw mm3, mm7
			punpckhbw mm6, mm7
			psllw mm6, 3                     // position green
			psllw mm1, 8                     // position red
			por mm6, mm3
			por mm6, mm1
			movq 8[eax], mm6                 // pixels 4..7

			punpcklbw mm2, mm7
			punpcklbw mm4, mm7
			punpcklbw mm5, mm7
			psllw mm2, 3                     // position green
			psllw mm5, 8                     // position red
			por mm2, mm4
			por mm2, mm5
			movq [eax], mm2                  // pixels 0..3

			add ebx, 8
			add ecx, 4
			add edx, 4
			add eax, 16

			inc edi
			jne horiz_loop2

			pop edi 
			pop edx 
			pop ecx
			pop ebx 
			pop eax

			emms                             // leave MMX state so x87 code can run again

		}
		// The assembly already advanced in_Y and pOut by one row; add the second row
		// and move the chroma planes down by one chroma row.
		in_Y   += nSrcPitch; 
		in_U   += nSrcPitch / 2;
		in_V   += nSrcPitch / 2;
		pOut += nDstPitch;

	}
}
 
// Converts a planar YV12 frame to packed 24-bit pixels (bytes stored B, G, R) using MMX.
//
//   in_Y, in_U, in_V : source luma / chroma planes; the call is a no-op if any is NULL.
//   dwWidth          : width in pixels. Pixels are converted in groups of 8
//                      (dwWidth >> 3 groups per row); a remainder of dwWidth & 7
//                      pixels is not converted.
//   dwHeight         : height in rows. Rows are converted in pairs that share one
//                      chroma row; a trailing odd row is not converted.
//   nSrcPitch        : byte pitch of the luma plane; nSrcPitch / 2 is used as the
//                      chroma pitch.
//   pOut, nDstPitch  : destination buffer and its byte pitch (24 bytes are written
//                      per 8-pixel group).
//   bFlip            : when set, the source is read bottom-up (pointers start on the
//                      last row and the source pitch is negated).
//
// The mmw_*/mmb_* operands are 64-bit constants defined outside this block.
void CDirectDisplay::yv12to_rgb24bit_mmx(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{
	if(NULL == in_Y || NULL == in_U || NULL == in_V)
		return ;

	// Negative number of 8-pixel groups per row; the assembly counts it up to zero.
	int horiz_circle = -1 * (dwWidth >> 3);

	// With fewer than 8 pixels the counter would start at 0 and "inc edi / jne" would
	// only terminate after wrapping through 2^32 iterations, running off every buffer.
	// With fewer than 2 rows there is no complete row pair to convert.
	if(0 == horiz_circle || dwHeight < 2)
		return ;

	if(bFlip)
	{
		// Start on the last luma / chroma row and walk the source backwards.
		in_Y = in_Y + (nSrcPitch * (dwHeight -1));
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		nSrcPitch = -1 * nSrcPitch;
	}

	// One iteration converts two luma rows. "i+1 < dwHeight" stops before a trailing
	// odd row, whose partner row would lie outside both the source and the destination.
	for (DWORD i=0; i+1<dwHeight; i+=2) 
	{
		__asm 
		{
			push eax
			push ebx
			push ecx
			push edx
			push edi

			mov eax, pOut                    // eax = destination cursor
			mov ebx, in_Y                    // ebx = luma cursor
			mov ecx, in_U                    // ecx = U cursor
			mov edx, in_V                    // edx = V cursor
			mov edi, horiz_circle            // edi = -(groups per row)
horiz_loop1:                                 // ---- first row of the pair ----

			movd mm2, [ecx]                  // mm2 = ________u3u2u1u0
			pxor mm7, mm7                    // mm7 = 0

			movd mm3, [edx]                  // mm3 = ________v3v2v1v0
			punpcklbw mm2, mm7               // mm2 = __u3__u2__u1__u0

			movq mm0, [ebx]                  // mm0 = y7y6y5y4y3y2y1y0
			punpcklbw mm3, mm7               // mm3 = __v3__v2__v1__v0

			movq mm1, mmw_0x00ff             // mm1 = 00ff00ff00ff00ff

			psubusb mm0, mmb_0x10            // y -= 16, clamped at 0

			psubw mm2, mmw_0x0080            // u -= 128
			pand mm1, mm0                    // mm1 = __y6__y4__y2__y0 (even pixels)

			psubw mm3, mmw_0x0080            // v -= 128
			psllw mm1, 3                     // even luma *= 8

			psrlw mm0, 8                     // mm0 = __y7__y5__y3__y1 (odd pixels)
			psllw mm2, 3                     // u *= 8

			pmulhw mm1, mmw_mult_Y           // even luma * luma coefficient
			psllw mm0, 3                     // odd luma *= 8

			psllw mm3, 3                     // v *= 8
			movq mm5, mm3                    // mm5 = v

			pmulhw mm5, mmw_mult_V_R         // mm5 = red chroma
			movq mm4, mm2                    // mm4 = u

			pmulhw mm0, mmw_mult_Y           // odd luma * luma coefficient
			movq mm7, mm1                    // mm7 = even luma

			pmulhw mm2, mmw_mult_U_G         // u * green coefficient
			paddsw mm7, mm5                  // mm7 = __r6__r4__r2__r0

			pmulhw mm3, mmw_mult_V_G         // v * green coefficient
			packuswb mm7, mm7                // mm7 = r6r4r2r0r6r4r2r0 (clipped)

			pmulhw mm4, mmw_mult_U_B         // mm4 = blue chroma
			paddsw mm5, mm0                  // mm5 = __r7__r5__r3__r1

			packuswb mm5, mm5                // mm5 = r7r5r3r1r7r5r3r1 (clipped)
			paddsw mm2, mm3                  // mm2 = green chroma

			movq mm3, mm1                    // mm3 = even luma
			movq mm6, mm1                    // mm6 = even luma

			paddsw mm3, mm4                  // mm3 = __b6__b4__b2__b0
			paddsw mm6, mm2                  // mm6 = __g6__g4__g2__g0

			punpcklbw mm7, mm5               // mm7 = r7r6r5r4r3r2r1r0
			paddsw mm2, mm0                  // mm2 = __g7__g5__g3__g1

			packuswb mm6, mm6                // mm6 = g6g4g2g0g6g4g2g0 (clipped)
			packuswb mm2, mm2                // mm2 = g7g5g3g1g7g5g3g1 (clipped)

			packuswb mm3, mm3                // mm3 = b6b4b2b0b6b4b2b0 (clipped)
			paddsw mm4, mm0                  // mm4 = __b7__b5__b3__b1

			packuswb mm4, mm4                // mm4 = b7b5b3b1b7b5b3b1 (clipped)
			punpcklbw mm6, mm2               // mm6 = g7g6g5g4g3g2g1g0

			punpcklbw mm3, mm4               // mm3 = b7b6b5b4b3b2b1b0

			// build 32-bit pixels 0..3
			pxor mm0, mm0                    // zero source for the green unpack below

			movq mm1, mm6                    // mm1 = g7g6g5g4g3g2g1g0
			punpcklbw mm1, mm0               // mm1 = __g3__g2__g1__g0

			movq mm0, mm3                    // mm0 = b7b6b5b4b3b2b1b0
			punpcklbw mm0, mm7               // mm0 = r3b3r2b2r1b1r0b0

			movq mm2, mm0                    // mm2 = r3b3r2b2r1b1r0b0

			punpcklbw mm0, mm1               // mm0 = __r1g1b1__r0g0b0
			punpckhbw mm2, mm1               // mm2 = __r3g3b3__r2g2b2

			// store as 24-bit: each movd writes 4 bytes, and the padding byte is
			// overwritten by the next pixel's store 3 bytes further on
			movd   [eax], mm0                // bytes 0..3   <- pixel 0
			psrlq mm0, 32                    // mm0 = __r1g1b1

			movd  3[eax], mm0                // bytes 3..6   <- pixel 1

			movd  6[eax], mm2                // bytes 6..9   <- pixel 2


			psrlq mm2, 32                    // mm2 = __r3g3b3

			movd  9[eax], mm2                // bytes 9..12  <- pixel 3

			// build 32-bit pixels 4..7
			pxor mm0, mm0                    // zero source for the green unpack below

			movq mm1, mm6                    // mm1 = g7g6g5g4g3g2g1g0
			punpckhbw mm1, mm0               // mm1 = __g7__g6__g5__g4

			movq mm0, mm3                    // mm0 = b7b6b5b4b3b2b1b0
			punpckhbw mm0, mm7               // mm0 = r7b7r6b6r5b5r4b4

			movq mm2, mm0                    // mm2 = r7b7r6b6r5b5r4b4

			punpcklbw mm0, mm1               // mm0 = __r5g5b5__r4g4b4
			punpckhbw mm2, mm1               // mm2 = __r7g7b7__r6g6b6

			movd 12[eax], mm0                // bytes 12..15 <- pixel 4
			psrlq mm0, 32                    // mm0 = __r5g5b5

			movd 15[eax], mm0                // bytes 15..18 <- pixel 5
			add ebx, 8                       // luma cursor += 8

			// assemble bytes 16..23 (g5 r5 b6 g6 r6 b7 g7 r7) in one register so the
			// last store ends exactly at byte 23 of the group
			movq  mm5, mm2                   // mm5 = __r7g7b7__r6g6b6
			psrlq mm0, 8                     // mm0 = ____r5g5
			psllq mm5, 32                    // mm5 = __r6g6b6________
			psrlq mm2, 32                    // mm2 = __________r7g7b7
			psrlq mm5, 16                    // mm5 = ______r6g6b6____
			por	  mm0, mm5                   // mm0 = ______r6g6b6r5g5
			psllq mm2, 40                    // mm2 = r7g7b7__________
			por	  mm2, mm0                   // mm2 = r7g7b7r6g6b6r5g5
			movq 16[eax],mm2                 // bytes 16..23


			add ecx, 4                       // U cursor += 4
			add edx, 4                       // V cursor += 4

			add eax, 24                      // destination cursor += 24 bytes

			inc edi
			jne horiz_loop1

			// step to the second row of the pair; the same chroma row is re-read
			mov eax, pOut
			add eax, nDstPitch
			mov pOut,eax
			mov ebx, in_Y
			add ebx,nSrcPitch
			mov in_Y,ebx

			mov ecx, in_U
			mov edx, in_V
			mov edi, horiz_circle
horiz_loop2:                                 // ---- second row: same steps as horiz_loop1 ----

			movd mm2, [ecx]
			pxor mm7, mm7

			movd mm3, [edx]
			punpcklbw mm2, mm7               // mm2 = __u3__u2__u1__u0

			movq mm0, [ebx]                  // mm0 = y7y6y5y4y3y2y1y0
			punpcklbw mm3, mm7               // mm3 = __v3__v2__v1__v0

			movq mm1, mmw_0x00ff

			psubusb mm0, mmb_0x10            // y -= 16, clamped at 0

			psubw mm2, mmw_0x0080            // u -= 128
			pand mm1, mm0                    // even pixels

			psubw mm3, mmw_0x0080            // v -= 128
			psllw mm1, 3

			psrlw mm0, 8                     // odd pixels
			psllw mm2, 3

			pmulhw mm1, mmw_mult_Y
			psllw mm0, 3

			psllw mm3, 3
			movq mm5, mm3

			pmulhw mm5, mmw_mult_V_R         // red chroma
			movq mm4, mm2

			pmulhw mm0, mmw_mult_Y
			movq mm7, mm1

			pmulhw mm2, mmw_mult_U_G
			paddsw mm7, mm5                  // red, even pixels

			pmulhw mm3, mmw_mult_V_G
			packuswb mm7, mm7

			pmulhw mm4, mmw_mult_U_B         // blue chroma
			paddsw mm5, mm0                  // red, odd pixels

			packuswb mm5, mm5
			paddsw mm2, mm3                  // green chroma

			movq mm3, mm1
			movq mm6, mm1

			paddsw mm3, mm4                  // blue, even pixels
			paddsw mm6, mm2                  // green, even pixels

			punpcklbw mm7, mm5               // mm7 = r7r6r5r4r3r2r1r0
			paddsw mm2, mm0                  // green, odd pixels

			packuswb mm6, mm6
			packuswb mm2, mm2

			packuswb mm3, mm3
			paddsw mm4, mm0                  // blue, odd pixels

			packuswb mm4, mm4
			punpcklbw mm6, mm2               // mm6 = g7g6g5g4g3g2g1g0

			punpcklbw mm3, mm4               // mm3 = b7b6b5b4b3b2b1b0

			// build 32-bit pixels 0..3
			pxor mm0, mm0

			movq mm1, mm6
			punpcklbw mm1, mm0               // mm1 = __g3__g2__g1__g0

			movq mm0, mm3
			punpcklbw mm0, mm7               // mm0 = r3b3r2b2r1b1r0b0

			movq mm2, mm0

			punpcklbw mm0, mm1               // mm0 = __r1g1b1__r0g0b0
			punpckhbw mm2, mm1               // mm2 = __r3g3b3__r2g2b2

			// store as 24-bit with overlapping 4-byte writes
			movd   [eax], mm0
			psrlq mm0, 32

			movd  3[eax], mm0

			movd  6[eax], mm2


			psrlq mm2, 32

			movd  9[eax], mm2

			// build 32-bit pixels 4..7
			pxor mm0, mm0

			movq mm1, mm6
			punpckhbw mm1, mm0               // mm1 = __g7__g6__g5__g4

			movq mm0, mm3
			punpckhbw mm0, mm7               // mm0 = r7b7r6b6r5b5r4b4

			movq mm2, mm0

			punpcklbw mm0, mm1               // mm0 = __r5g5b5__r4g4b4
			punpckhbw mm2, mm1               // mm2 = __r7g7b7__r6g6b6

			movd 12[eax], mm0
			psrlq mm0, 32

			movd 15[eax], mm0
			add ebx, 8

			// assemble bytes 16..23 in one register
			movq  mm5, mm2
			psrlq mm0, 8
			psllq mm5, 32
			psrlq mm2, 32
			psrlq mm5, 16
			por	  mm0, mm5
			psllq mm2, 40
			por	  mm2, mm0
			movq 16[eax],mm2

			add ecx, 4
			add edx, 4

			add eax, 24

			inc edi
			jne horiz_loop2

			pop edi 
			pop edx 
			pop ecx
			pop ebx 
			pop eax

			emms                             // leave MMX state so x87 code can run again

		} 
		// The assembly already advanced in_Y and pOut by one row; add the second row
		// and move the chroma planes down by one chroma row.
		in_Y   += nSrcPitch;
		in_U   += nSrcPitch / 2;
		in_V   += nSrcPitch / 2;
		pOut += nDstPitch; 
	}
}
 
// Converts a planar YV12 frame to packed 32-bit pixels (bytes stored B, G, R, 0) using MMX.
//
//   in_Y, in_U, in_V : source luma / chroma planes; the call is a no-op if any is NULL.
//   dwWidth          : width in pixels. Pixels are converted in groups of 8
//                      (dwWidth >> 3 groups per row); a remainder of dwWidth & 7
//                      pixels is not converted.
//   dwHeight         : height in rows. Rows are converted in pairs that share one
//                      chroma row; a trailing odd row is not converted.
//   nSrcPitch        : byte pitch of the luma plane; nSrcPitch / 2 is used as the
//                      chroma pitch.
//   pOut, nDstPitch  : destination buffer and its byte pitch (32 bytes are written
//                      per 8-pixel group).
//   bFlip            : when set, the source is read bottom-up (pointers start on the
//                      last row and the source pitch is negated).
//
// The mmw_*/mmb_* operands are 64-bit constants defined outside this block.
void CDirectDisplay::yv12to_rgb32bit_mmx(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{
	if(NULL == in_Y || NULL == in_U || NULL == in_V)
		return ;

	// Negative number of 8-pixel groups per row; the assembly counts it up to zero.
	int horiz_circle = -1 * (dwWidth >> 3);

	// Number of complete row pairs. The row loop used to subtract 2 from dwHeight and
	// test for zero, which never terminates cleanly for an odd or zero height;
	// counting pairs makes the exit condition exact.
	DWORD nRowPairs = dwHeight >> 1;

	// With fewer than 8 pixels the column counter would start at 0 and
	// "inc edi / jne" would wrap through 2^32 iterations.
	if(0 == horiz_circle || 0 == nRowPairs)
		return ;

	if(bFlip)
	{
		// Start on the last luma / chroma row and walk the source backwards.
		in_Y = in_Y + (nSrcPitch * (dwHeight -1));
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		nSrcPitch = -1 * nSrcPitch;
	}
	int nSrcPitchUV = nSrcPitch / 2;         // chroma pitch (negative when flipped)
	__asm 
	{
		push eax
		push ebx
		push ecx
		push edx
		push edi
		push esi

		mov esi, nRowPairs                   // esi = row pairs left

		mov eax, pOut                        // eax = destination cursor
		mov ebx, in_Y                        // ebx = luma cursor
		mov ecx, in_U                        // ecx = U cursor
		mov edx, in_V                        // edx = V cursor
loopbegin:
		mov edi, horiz_circle                // edi = -(groups per row)
horiz_loop1:                                 // ---- first row of the pair ----
		movd mm2, [ecx]                      // mm2 = ________u3u2u1u0
		pxor mm7, mm7                        // mm7 = 0

		movd mm3, [edx]                      // mm3 = ________v3v2v1v0
		punpcklbw mm2, mm7                   // mm2 = __u3__u2__u1__u0

		movq mm0, [ebx]                      // mm0 = y7y6y5y4y3y2y1y0
		punpcklbw mm3, mm7                   // mm3 = __v3__v2__v1__v0

		movq mm1, mmw_0x00ff                 // mm1 = 00ff00ff00ff00ff

		psubusb mm0, mmb_0x10                // packed saturating subtract: y -= 16, clamped at 0

		psubw mm2, mmw_0x0080                // u -= 128
		pand mm1, mm0                        // mm1 = __y6__y4__y2__y0 (even pixels)

		psubw mm3, mmw_0x0080                // v -= 128
		psllw mm1, 3                         // even luma *= 8

		psrlw mm0, 8                         // mm0 = __y7__y5__y3__y1 (odd pixels)
		psllw mm2, 3                         // u *= 8

		pmulhw mm1, mmw_mult_Y               // even luma * luma coefficient
		psllw mm0, 3                         // odd luma *= 8

		psllw mm3, 3                         // v *= 8
		movq mm5, mm3                        // mm5 = v

		pmulhw mm5, mmw_mult_V_R             // mm5 = red chroma
		movq mm4, mm2                        // mm4 = u

		pmulhw mm0, mmw_mult_Y               // odd luma * luma coefficient
		movq mm7, mm1                        // mm7 = even luma

		pmulhw mm2, mmw_mult_U_G             // u * green coefficient
		paddsw mm7, mm5                      // mm7 = __r6__r4__r2__r0

		pmulhw mm3, mmw_mult_V_G             // v * green coefficient
		packuswb mm7, mm7                    // mm7 = r6r4r2r0r6r4r2r0 (clipped)

		pmulhw mm4, mmw_mult_U_B             // mm4 = blue chroma
		paddsw mm5, mm0                      // mm5 = __r7__r5__r3__r1

		packuswb mm5, mm5                    // mm5 = r7r5r3r1r7r5r3r1 (clipped)
		paddsw mm2, mm3                      // mm2 = green chroma

		movq mm3, mm1                        // mm3 = even luma
		movq mm6, mm1                        // mm6 = even luma

		paddsw mm3, mm4                      // mm3 = __b6__b4__b2__b0
		paddsw mm6, mm2                      // mm6 = __g6__g4__g2__g0

		punpcklbw mm7, mm5                   // mm7 = r7r6r5r4r3r2r1r0
		paddsw mm2, mm0                      // mm2 = __g7__g5__g3__g1

		packuswb mm6, mm6                    // mm6 = g6g4g2g0g6g4g2g0 (clipped)
		packuswb mm2, mm2                    // mm2 = g7g5g3g1g7g5g3g1 (clipped)

		packuswb mm3, mm3                    // mm3 = b6b4b2b0b6b4b2b0 (clipped)
		paddsw mm4, mm0                      // mm4 = __b7__b5__b3__b1

		packuswb mm4, mm4                    // mm4 = b7b5b3b1b7b5b3b1 (clipped)
		punpcklbw mm6, mm2                   // mm6 = g7g6g5g4g3g2g1g0

		punpcklbw mm3, mm4                   // mm3 = b7b6b5b4b3b2b1b0

		// build 32-bit pixels 0..3
		pxor mm0, mm0                        // zero source for the green unpack below

		movq mm1, mm6                        // mm1 = g7g6g5g4g3g2g1g0
		punpcklbw mm1, mm0                   // mm1 = __g3__g2__g1__g0

		movq mm0, mm3                        // mm0 = b7b6b5b4b3b2b1b0
		punpcklbw mm0, mm7                   // mm0 = r3b3r2b2r1b1r0b0

		movq mm2, mm0                        // mm2 = r3b3r2b2r1b1r0b0

		punpcklbw mm0, mm1                   // mm0 = __r1g1b1__r0g0b0
		punpckhbw mm2, mm1                   // mm2 = __r3g3b3__r2g2b2

		// store pixels 0..3
		movq  [eax], mm0                     // bytes 0..7   <- pixels 0,1
		movq mm1, mm6                        // mm1 = g7g6g5g4g3g2g1g0

		movq 8[eax], mm2                     // bytes 8..15  <- pixels 2,3

		// build 32-bit pixels 4..7
		pxor mm0, mm0                        // zero source for the green unpack below

		punpckhbw mm1, mm0                   // mm1 = __g7__g6__g5__g4

		movq mm0, mm3                        // mm0 = b7b6b5b4b3b2b1b0
		punpckhbw mm0, mm7                   // mm0 = r7b7r6b6r5b5r4b4

		movq mm2, mm0                        // mm2 = r7b7r6b6r5b5r4b4

		punpcklbw mm0, mm1                   // mm0 = __r5g5b5__r4g4b4
		punpckhbw mm2, mm1                   // mm2 = __r7g7b7__r6g6b6

		// store pixels 4..7 and advance the cursors
		add ebx, 8                           // luma cursor += 8
		add ecx, 4                           // U cursor += 4

		movq 16[eax], mm0                    // bytes 16..23 <- pixels 4,5
		add edx, 4                           // V cursor += 4

		movq 24[eax], mm2                    // bytes 24..31 <- pixels 6,7

		add eax, 32                          // destination cursor += 32 bytes

		inc edi
		jne horiz_loop1

		// step to the second row of the pair; the same chroma row is re-read
		mov eax, pOut
		add eax, nDstPitch
		mov pOut,eax
		mov ebx, in_Y
		add ebx,nSrcPitch
		mov in_Y,ebx

		mov ecx, in_U
		mov edx, in_V
		mov edi, horiz_circle
horiz_loop2:                                 // ---- second row: same steps as horiz_loop1 ----
		movd mm2, [ecx]
		pxor mm7, mm7

		movd mm3, [edx]
		punpcklbw mm2, mm7                   // mm2 = __u3__u2__u1__u0

		movq mm0, [ebx]                      // mm0 = y7y6y5y4y3y2y1y0
		punpcklbw mm3, mm7                   // mm3 = __v3__v2__v1__v0

		movq mm1, mmw_0x00ff

		psubusb mm0, mmb_0x10                // y -= 16, clamped at 0

		psubw mm2, mmw_0x0080                // u -= 128
		pand mm1, mm0                        // even pixels

		psubw mm3, mmw_0x0080                // v -= 128
		psllw mm1, 3

		psrlw mm0, 8                         // odd pixels
		psllw mm2, 3

		pmulhw mm1, mmw_mult_Y
		psllw mm0, 3

		psllw mm3, 3
		movq mm5, mm3

		pmulhw mm5, mmw_mult_V_R             // red chroma
		movq mm4, mm2

		pmulhw mm0, mmw_mult_Y
		movq mm7, mm1

		pmulhw mm2, mmw_mult_U_G
		paddsw mm7, mm5                      // red, even pixels

		pmulhw mm3, mmw_mult_V_G
		packuswb mm7, mm7

		pmulhw mm4, mmw_mult_U_B             // blue chroma
		paddsw mm5, mm0                      // red, odd pixels

		packuswb mm5, mm5
		paddsw mm2, mm3                      // green chroma

		movq mm3, mm1
		movq mm6, mm1

		paddsw mm3, mm4                      // blue, even pixels
		paddsw mm6, mm2                      // green, even pixels

		punpcklbw mm7, mm5                   // mm7 = r7r6r5r4r3r2r1r0
		paddsw mm2, mm0                      // green, odd pixels

		packuswb mm6, mm6
		packuswb mm2, mm2

		packuswb mm3, mm3
		paddsw mm4, mm0                      // blue, odd pixels

		packuswb mm4, mm4
		punpcklbw mm6, mm2                   // mm6 = g7g6g5g4g3g2g1g0

		punpcklbw mm3, mm4                   // mm3 = b7b6b5b4b3b2b1b0

		// build 32-bit pixels 0..3
		pxor mm0, mm0

		movq mm1, mm6
		punpcklbw mm1, mm0                   // mm1 = __g3__g2__g1__g0

		movq mm0, mm3
		punpcklbw mm0, mm7                   // mm0 = r3b3r2b2r1b1r0b0

		movq mm2, mm0

		punpcklbw mm0, mm1                   // mm0 = __r1g1b1__r0g0b0
		punpckhbw mm2, mm1                   // mm2 = __r3g3b3__r2g2b2

		// store pixels 0..3
		movq  [eax], mm0
		movq mm1, mm6

		movq 8[eax], mm2

		// build 32-bit pixels 4..7
		pxor mm0, mm0

		punpckhbw mm1, mm0                   // mm1 = __g7__g6__g5__g4

		movq mm0, mm3
		punpckhbw mm0, mm7                   // mm0 = r7b7r6b6r5b5r4b4

		movq mm2, mm0

		punpcklbw mm0, mm1                   // mm0 = __r5g5b5__r4g4b4
		punpckhbw mm2, mm1                   // mm2 = __r7g7b7__r6g6b6

		// store pixels 4..7 and advance the cursors
		add ebx, 8
		add ecx, 4

		movq 16[eax], mm0
		add edx, 4

		movq 24[eax], mm2

		add eax, 32

		inc edi
		jne horiz_loop2

		// advance every plane to the next row pair; the registers keep the new
		// values for the next pass through loopbegin
		mov eax,pOut
		add eax,nDstPitch
		mov pOut,eax
		mov ebx,in_Y
		add ebx,nSrcPitch
		mov in_Y,ebx
		mov ecx,in_U
		add ecx,nSrcPitchUV
		mov in_U,ecx
		mov edx,in_V
		add edx,nSrcPitchUV
		mov in_V,edx

		dec esi                              // one row pair done
		jne loopbegin
		pop esi
		pop edi 
		pop edx 
		pop ecx
		pop ebx 
		pop eax

		emms                                 // leave MMX state so x87 code can run again
	}	
}
 
// Repacks a planar YV12 frame into packed 4:2:2 with Y U Y V byte order using MMX.
//
//   in_Y, in_U, in_V : source luma / chroma planes; the call is a no-op if any is NULL.
//   dwWidth          : width in pixels. Pixels are handled in chunks of 16; a final
//                      partial chunk is still processed as a full 16 pixels, so the
//                      rows must have room for the rounded-up width.
//   dwHeight         : height in rows. Rows are handled in pairs that share one
//                      chroma row; a trailing odd row is not converted.
//   nSrcPitch        : byte pitch of the luma plane; nSrcPitch / 2 is used as the
//                      chroma pitch.
//   pOut, nDstPitch  : destination buffer and its byte pitch (2 bytes per pixel).
//   bFlip            : when set, the source is read bottom-up (pointers start on the
//                      last row and the source pitch is negated).
void CDirectDisplay::yv12to_yuv422_mmx(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{
	// The guard used to test in_Y twice and never looked at in_V.
	if(NULL == in_Y || NULL == in_U || NULL == in_V )
		return;
	// The loops below are bottom-tested, so an empty frame would still have one
	// chunk read and written.
	if(0 == dwWidth || dwHeight < 2)
		return;

	if(bFlip)
	{
		// Start on the last luma / chroma row and walk the source backwards.
		in_Y = in_Y + (nSrcPitch * (dwHeight -1));
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ;
		nSrcPitch = -1 * nSrcPitch;
	}

	// End-of-row corrections. They are derived AFTER the flip adjustment, in signed
	// arithmetic, so that a negated pitch steps the source backwards; computing them
	// from the original positive pitch sent a flipped conversion past the end of the
	// source. nSpan is what the column loop really consumes per row: the width
	// rounded up to whole 16-pixel chunks.
	int nSpan = (int)(((dwWidth + 15) >> 4) << 4);
	int SrcStride = nSrcPitch + nSrcPitch - nSpan;   // luma: skip the partner row
	int SrcStrideU = nSrcPitch / 2 - nSpan / 2;      // chroma: next chroma row
	int DstStride = (nDstPitch - nSpan) * 2;         // output: skip the partner row
	// Only whole row pairs are converted.
	DWORD nRows = dwHeight & ~(DWORD)1;
	__asm
	{

			push eax
			push ebx
			push ecx
			push edx
			push edi
			push esi


			mov edi , [pOut]                 // edi = destination cursor
			mov esi , [in_Y]                 // esi = luma cursor
			mov eax , [in_U]                 // eax = U cursor
			mov ebx , [in_V]                 // ebx = V cursor
			mov ecx , [nRows]                // ecx = rows left
			mov edx , [dwWidth]              // edx = pixels left in this row
cyc: 
			movq mm2 , qword ptr [eax]       // u0..u7
			movq mm3 , qword ptr [ebx]       // v0..v7

			movq mm0 , qword ptr [esi]       // y0..y7
			movq mm1 , qword ptr [esi+8]     // y8..y15

			movq mm4 , mm2
			punpcklbw mm2 , mm3              // u0 v0 u1 v1 u2 v2 u3 v3
			punpckhbw mm4 , mm3              // u4 v4 u5 v5 u6 v6 u7 v7

			movq mm3 , mm0
			movq mm5 , mm1
			punpcklbw mm0 , mm2              // y0 u0 y1 v0 y2 u1 y3 v1
			punpckhbw mm3 , mm2              // y4 u2 y5 v2 y6 u3 y7 v3
			punpcklbw mm1 , mm4              // y8  u4 y9  v4 y10 u5 y11 v5
			punpckhbw mm5 , mm4              // y12 u6 y13 v6 y14 u7 y15 v7

			movq qword ptr [edi] , mm0
			movq qword ptr [edi+8] , mm3
			movq qword ptr [edi+16] , mm1
			movq qword ptr [edi+24] , mm5

			// second row of the pair: new luma, same interleaved chroma (mm2 / mm4)
			add esi , [nSrcPitch]
			add edi , [nDstPitch]

			movq mm0 , qword ptr [esi]       // y0..y7 of the partner row
			movq mm1 , qword ptr [esi+8]     // y8..y15 of the partner row

			movq mm3 , mm0
			movq mm5 , mm1
			punpcklbw mm0 , mm2
			punpcklbw mm1 , mm4
			punpckhbw mm3 , mm2
			punpckhbw mm5 , mm4

			movq qword ptr [edi] , mm0
			movq qword ptr [edi+8] , mm3
			movq qword ptr [edi+16] , mm1
			movq qword ptr [edi+24] , mm5

			// back to the first row and on to the next 16-pixel chunk
			sub esi , [nSrcPitch]
			sub edi , [nDstPitch]

			add eax , 8                      // U cursor += 8
			add ebx , 8                      // V cursor += 8
			add esi , 16                     // luma cursor += 16
			add edi , 32                     // destination cursor += 32 bytes

			sub edx,16
			ja cyc                           // loop while pixels remain (unsigned > 0)

			mov edx,[dwWidth]

			// move every cursor to the start of the next row pair
			add esi , [SrcStride]
			add eax , [SrcStrideU]
			add ebx , [SrcStrideU]
			add edi , [DstStride]

			sub ecx,2
			ja cyc                           // loop while row pairs remain

			pop esi
			pop edi 
			pop edx 
			pop ecx
			pop ebx 
			pop eax

			emms                             // leave MMX state so x87 code can run again
	}

}
 
/*void CDirectDisplay::yv12to_rgb555_sse2(PBYTE in_Y,PBYTE in_U,PBYTE in_V,DWORD dwWidth,DWORD dwHeight,int nSrcPitch,PBYTE pOut,int nDstPitch,BOOL bFlip) 
{ 
} 
 
void CDirectDisplay::yv12to_rgb565_sse2(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
 
} 
 
void CDirectDisplay::yv12to_rgb24bit_sse2(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
 
} 
 
void CDirectDisplay::yv12to_rgb32bit_sse2(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
 
} 
 
void CDirectDisplay::yv12to_yuv422_sse2(PBYTE in_Y, PBYTE in_U, PBYTE in_V, DWORD dwWidth, DWORD dwHeight, int nSrcPitch, PBYTE pOut, int nDstPitch,BOOL bFlip) 
{ 
	if(NULL == in_Y || NULL == in_U || NULL == in_Y ) 
		return; 
	int SrcStride = nSrcPitch + nSrcPitch - dwWidth;	 
	int SrcStrideU = (nSrcPitch - dwWidth)>>1; 
	int DstStride =(nDstPitch - dwWidth)<<1; 
	if(bFlip) 
	{ 
		in_Y = in_Y + (nSrcPitch * (dwHeight -1)); 
		in_U = in_U + (nSrcPitch/2 * (dwHeight /2 -1)) ; 
		in_V = in_V + (nSrcPitch/2 * (dwHeight /2 -1)) ; 
		nSrcPitch = -1 * nSrcPitch; 
	} 
	__asm 
	{ 
			 
			push eax 
			push ebx 
			push ecx 
			push edx 
			push edi 
			push esi 
 
 
			mov edi , [pOut] 
			mov esi , [in_Y] 
			mov eax , [in_U] 
			mov ebx , [in_V] 
			mov ecx , [dwHeight] 
			mov edx , [dwWidth] 
cyc:  
			movdqa xmm2 , [eax] //u0..u15 
			movdqa xmm3 , [ebx] //v0..v15 
			 
			movdqa xmm0 , [esi] //y0..y15 
			movdqa xmm1 , [esi+16] //y16..y31 
			 
			movdqa xmm4 , xmm2		//mm4 = u0..u15 
			punpcklbw xmm2 , xmm3 // uv1 
			punpckhbw xmm4 , xmm3 // uv2 
			 
			movdqa xmm3 , xmm0 
			movdqa xmm5 , xmm1 
			punpcklbw xmm0 , xmm2 // yuyv1 
			punpckhbw xmm3 , xmm2 // yuyv2 
			punpcklbw xmm1 , xmm4 // yuyv3 
			punpckhbw xmm5 , xmm4 // yuyv4 
			 
			movdqa [edi] , xmm0 
			movdqa [edi+16] , xmm3 
			movdqa [edi+32] , xmm1 
			movdqa [edi+48] , xmm5 
			 
			add esi , [nSrcPitch] 
			add edi , [nDstPitch] 
			 
			movdqa xmm0 , [esi] //y1 
			movdqa xmm1 , [esi+16] //y2 
			 
			movdqa xmm3 , xmm0 
			movdqa xmm5 , xmm1 
			punpcklbw xmm0 , xmm2 // yuyv1 
			punpcklbw xmm1 , xmm4 // yuyv3 
			punpckhbw xmm3 , xmm2 // yuyv2 
			punpckhbw xmm5 , xmm4 // yuyv4 
			 
			movdqa [edi] , xmm0 
			movdqa [edi+16] , xmm3 
			movdqa [edi+32] , xmm1 
			movdqa [edi+48] , xmm5 
			 
			sub esi , [nSrcPitch] 
			sub edi , [nDstPitch] 
			 
			add eax , 16 
			add ebx , 16 
			add esi , 32 
			add edi , 64 
			 
			sub edx,32			 
			ja cyc 
			 
			mov edx,[dwWidth] 
			 
			add esi , [SrcStride] 
			add eax , [SrcStrideU] 
			add ebx , [SrcStrideU] 
			add edi , [DstStride] 
			 
			sub ecx,2 
			ja cyc 
 
			pop esi 
			pop edi  
			pop edx  
			pop ecx 
			pop ebx  
			pop eax 
					 
//			emms 
	} 
}*/ 
 
 
// Fills the start of pBuf with the two headers of a 24-bit, uncompressed
// (BI_RGB) BMP file describing an image of m_dwWidth x m_dwHeight pixels.
//
// pBuf : destination buffer; it must have room for at least
//        sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) bytes.
//        The pointer is not validated here.
//
// The pixel payload is sized as width * height * 3 bytes, i.e. rows are
// described without any padding.
void CDirectDisplay::Create_BmpHeader(PBYTE pBuf)
{
	// Byte counts shared by several header fields.
	const DWORD dwHeadersLen = sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER);
	const DWORD dwPixelsLen  = m_dwWidth * m_dwHeight * 3;

	// The info header sits immediately behind the file header.
	BITMAPFILEHEADER *pFileHdr = (BITMAPFILEHEADER *)pBuf;
	BITMAPINFOHEADER *pInfoHdr = (BITMAPINFOHEADER *)(pBuf+sizeof(BITMAPFILEHEADER));

	// --- file header ---
	pFileHdr->bfType      = 0x4d42;                    // "BM" signature
	pFileHdr->bfSize      = dwPixelsLen+dwHeadersLen;  // pixel data + both headers
	pFileHdr->bfReserved1 = 0;
	pFileHdr->bfReserved2 = 0;
	pFileHdr->bfOffBits   = dwHeadersLen;              // pixel data starts after the headers

	// --- info header ---
	pInfoHdr->biSize          = sizeof(BITMAPINFOHEADER);
	pInfoHdr->biWidth         = m_dwWidth;
	pInfoHdr->biHeight        = m_dwHeight;
	pInfoHdr->biPlanes        = 1;
	pInfoHdr->biBitCount      = 24;
	pInfoHdr->biCompression   = BI_RGB;
	pInfoHdr->biSizeImage     = dwPixelsLen;
	pInfoHdr->biXPelsPerMeter = 0;
	pInfoHdr->biYPelsPerMeter = 0;
	pInfoHdr->biClrUsed       = 0;
	pInfoHdr->biClrImportant  = 0;
}
 
// Captures the currently stored YV12 frame (m_pY / m_pU / m_pV) into the
// caller's buffer, either as a JPEG or as a 24-bit BMP file image.
//
// nImageType  : IMAGE_TYPE_JPG selects JPEG encoding through YUV2Jpg();
//               any other value produces a BMP.
// nQuality    : passed straight to YUV2Jpg(); unused for BMP.
// pImageBuf   : destination buffer supplied by the caller.
// pnImageSize : BMP: on entry the capacity of pImageBuf in bytes, on success
//               the number of bytes written (headers + pixel data).
//               JPEG: handed to YUV2Jpg() unchanged.
//
// Returns TRUE on success. Returns FALSE when an argument is NULL, when
// m_hCapMutex cannot be acquired within 200 ms, when no frame planes are
// available, when the BMP buffer is too small, or when YUV2Jpg() returns a
// non-zero value.
BOOL CDirectDisplay::CaptureImage(int nImageType,int nQuality,unsigned char *pImageBuf, unsigned long *pnImageSize)
{
	if(NULL == pImageBuf || NULL == pnImageSize)
		return FALSE;

	const DWORD dwWait = WaitForSingleObject(m_hCapMutex,200);
	if(WAIT_ABANDONED == dwWait)
	{
		// WAIT_ABANDONED still hands ownership of the mutex to this thread.
		// Give it back before reporting failure, otherwise it would stay
		// locked and every later user of the mutex would block.
		ReleaseMutex(m_hCapMutex);
		return FALSE;
	}
	if(WAIT_OBJECT_0 != dwWait)
		return FALSE;

	BOOL bRet = FALSE;
	if((m_pY != NULL) && (m_pU != NULL) && (m_pV != NULL))
	{
		if(IMAGE_TYPE_JPG == nImageType)
		{
			bRet = (YUV2Jpg(m_pY,m_pU,m_pV,m_dwWidth,m_dwHeight,nQuality,m_nSrcPitch,pImageBuf,pnImageSize) == 0);
		}
		else
		{
			// Compute the required size while holding the mutex so that the
			// capacity check and the conversion use the same dimensions.
			const unsigned long ulHeaderSize = (unsigned long)(sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER));
			const unsigned long ulFileSize = m_dwWidth * m_dwHeight * 3 + ulHeaderSize;
			if(*pnImageSize >= ulFileSize)
			{
				// Bitmap pixel data is stored starting from the last row, so
				// the conversion is asked to flip the image vertically.
				yv12to_rgb24bit(m_pY,m_pU,m_pV,m_dwWidth,m_dwHeight,m_nSrcPitch,pImageBuf+ulHeaderSize,m_dwWidth * 3,TRUE);
				Create_BmpHeader(pImageBuf);
				*pnImageSize = ulFileSize;
				bRet = TRUE;
			}
		}
	}
	ReleaseMutex(m_hCapMutex);
	return bRet;
}