www.pudn.com > DirectDraw.rar > DirectDisplay2.cpp, change:2007-09-19,size:55559b


// DirectDisplay.cpp: implementation of the CDirectDisplay class. 
// 
////////////////////////////////////////////////////////////////////// 
#include "StdAfx.h" 
#include "DirectDisplay.h" 
 
#include <ddraw.h> 
#include <stdio.h> 
 
#pragma comment (lib,"winmm.lib") 
#pragma comment (lib,"ddraw.lib") 
#pragma comment (lib,"dxerr8.lib") 
#pragma comment (lib,"dxguid.lib") 
 
#pragma pack(1) 
 
#define int8_t char 
#define BYTE unsigned char 
#define int16_t short 
#define uint16_t unsigned short 
#define int32_t int 
#define uint32_t unsigned int  
#define int64_t __int64 
#define uint64_t unsigned __int64 
 
// colourspace conversion matrix values  
static uint64_t mmw_mult_Y    = 0x2568256825682568; 
static uint64_t mmw_mult_U_G  = 0xf36ef36ef36ef36e; 
static uint64_t mmw_mult_U_B  = 0x40cf40cf40cf40cf; 
static uint64_t mmw_mult_V_R  = 0x3343334333433343; 
static uint64_t mmw_mult_V_G  = 0xe5e2e5e2e5e2e5e2; 
 
 
// various masks and other constants  
static uint64_t mmb_0x10      = 0x1010101010101010; 
static uint64_t mmw_0x0080    = 0x0080008000800080; 
static uint64_t mmw_0x00ff    = 0x00ff00ff00ff00ff; 
 
// 5 Jan 2001  - Andrea Graziani     
static uint64_t mask_5		= 0xf8f8f8f8f8f8f8f8; 
static uint64_t mask_6		= 0xfcfcfcfcfcfcfcfc; 
static uint64_t mask_blue	= 0x1f1f1f1f1f1f1f1f; 
 
 
#define SAFE_DELETE(p)  { if(p) { delete (p);     (p)=NULL; } } 
#define SAFE_RELEASE(p) { if(p) { (p)->Release(); (p)=NULL; } } 
 
DDPIXELFORMAT  g_ddpfYU12Formats= {sizeof(DDPIXELFORMAT), DDPF_FOURCC,MAKEFOURCC('Y','V','1','2'),0,0,0,0,0}; 
DDPIXELFORMAT  g_ddpfYUY2Formats= {sizeof(DDPIXELFORMAT), DDPF_FOURCC,MAKEFOURCC('Y','U','Y','2'),0,0,0,0,0}; 
DDPIXELFORMAT  g_ddpfUYVYFormats= {sizeof(DDPIXELFORMAT), DDPF_FOURCC,MAKEFOURCC('U','Y','V','Y'),0,0,0,0,0}; 
DDPIXELFORMAT  g_ddpfRGB16Formats= {sizeof(DDPIXELFORMAT), DDPF_RGB,0,16,0xF800,0x7E0,0x1F,0}; 
DDPIXELFORMAT  g_ddpfOSDFormats= {sizeof(DDPIXELFORMAT), DDPF_RGB,0,32,0xFF0000,0xFF00,0xFF,0}; 
 
HMODULE m_hddrawDll = 0; 
DirectDrawCreateEx_ DirectDrawCreateEx__ = 0; 
// -------------------------------------------------------------------- 
CDirectDisplay::CDirectDisplay() 
{ 
	m_hWnd=NULL; 
	m_nWidth = m_nHeight = 0; 
	m_pDD=NULL; 
	m_pddsFrontBuffer=NULL; 
	m_pddsBackBuffer=NULL; 
	m_pddsOSDBuffer = 0; 
	if(m_hddrawDll == 0) 
		m_hddrawDll = ::LoadLibrary(TEXT("ddraw.dll")); 
	m_Memflag = 0; 
	lpSurface = 0; 
	m_lPitch = 0; 
    iScreenWidth = 0; 
    iScreenHeight = 0; 
	m_pDrawOSD = 0; 
	m_videocallback = 0; 
	m_dwvideouser = 0; 
	m_dwUser = 0; 
	m_bDrawRect = FALSE; 
	ZeroMemory(&m_DrawRect,sizeof(RECT)); 
 
} 
 
CDirectDisplay::~CDirectDisplay() 
{ 
	ReleaseObject(); 
} 
 
void CDirectDisplay::ReleaseObject() 
{ 
    SAFE_RELEASE( m_pddsBackBuffer ); 
    SAFE_RELEASE( m_pddsFrontBuffer ); 
	SAFE_RELEASE(m_pddsOSDBuffer); 
    if( m_pDD ) 
        m_pDD->SetCooperativeLevel( 0, DDSCL_NORMAL ); 
    SAFE_RELEASE( m_pDD ); 
} 
 
void CDirectDisplay::SetDisplayRect(BOOL bDrawRect,RECT *pRect) 
{ 
	if(bDrawRect && pRect) 
	{ 
		m_bDrawRect = TRUE; 
		memcpy(&m_DrawRect,pRect,sizeof(RECT)); 
	} 
	else 
		m_bDrawRect = FALSE; 
} 
 
BOOL CDirectDisplay::CreateDDraw(HWND hWnd) 
{ 
	if(m_hddrawDll == 0) 
	{ 
		MessageBox(0,"Load library ddraw.dll Failed.Please install DirectX 9.0.","Error",MB_ICONERROR); 
		return FALSE; 
	} 
	if(DirectDrawCreateEx__ == 0) 
		DirectDrawCreateEx__ = (DirectDrawCreateEx_)GetProcAddress(m_hddrawDll,"DirectDrawCreateEx"); 
	if(DirectDrawCreateEx__ == 0) 
	{ 
		MessageBox(0,"Load library ddraw.dll Failed.Please install DirectX 9.0.","Error",MB_ICONERROR); 
		return FALSE; 
	} 
	HRESULT hr; 
	DDCAPS	ddcap; 
	ReleaseObject(); 
	if( FAILED( hr = DirectDrawCreateEx__( NULL, (VOID**)&m_pDD,IID_IDirectDraw7, NULL ) ) ) 
	{ 
		return FALSE; 
	} 
	ZeroMemory(&ddcap,sizeof(DDCAPS)); 
	ddcap.dwSize = sizeof(DDCAPS); 
	hr=m_pDD->GetCaps(&ddcap,NULL); 
	if(FAILED(hr)) 
	{ 
		return FALSE; 
	} 
	hr = m_pDD->SetCooperativeLevel( 0, DDSCL_NORMAL); 
	if( FAILED(hr) ) 
	{ 
		return FALSE; 
	} 
	//优先使用显卡内存 
	m_Memflag = DDSCAPS_VIDEOMEMORY; 
	//  创建主表面 
	DDSURFACEDESC2 ddsd;	 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
	m_pDD->GetDisplayMode(&ddsd); 
	iScreenWidth = ddsd.dwWidth; 
	iScreenHeight = ddsd.dwHeight; 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
	ddsd.dwFlags = DDSD_CAPS; 
	ddsd.ddsCaps.dwCaps = DDSCAPS_PRIMARYSURFACE ; 
	if( FAILED( m_pDD->CreateSurface( &ddsd, &m_pddsFrontBuffer, NULL ) ) ) 
	{ 
		return FALSE; 
	} 
	m_hWnd = hWnd; 
	return TRUE; 
} 
 
BOOL CDirectDisplay::InitDDraw(DWORD dwWidth,DWORD dwHeight,DWORD dwVideoFormat /* = 0 */) 
{ 
	DWORD dwError = 0; 
	m_nWidth = dwWidth; 
	m_nHeight = dwHeight; 
	if(IsWindow(m_hWnd) == FALSE) 
		return FALSE; 
	if(m_pDD == 0 && m_pddsFrontBuffer == 0) 
		return FALSE; 
	DWORD dwSize = 0; 
	DWORD *lpBuffer; 
	HRESULT hr; 
	hr = m_pDD->GetFourCCCodes(&dwSize,NULL); 
	if(FAILED(hr)) 
		return FALSE; 
	//if(dwSize == 0)时,屏幕被锁定,不能创建表面 
	lpBuffer = new DWORD[dwSize]; 
	hr = m_pDD->GetFourCCCodes(&dwSize,lpBuffer); 
	if(FAILED(hr)) 
		return FALSE; 
	char *pString = new char[dwSize * 10 + 20]; 
	ZeroMemory(pString,dwSize * 10); 
	int pPos = 0; 
	sprintf(pString,"共支持%d种格式:\n",dwSize); 
	pPos = strlen(pString ); 
	for(DWORD i = 0; i < dwSize; i++) 
	{ 
		memcpy(pString+pPos,&lpBuffer[i],sizeof(DWORD)); 
		pPos += 4; 
		*(pString +pPos)= '\n'; 
		pPos ++; 
	} 
	MessageBox(NULL,pString,NULL,MB_OK); 
	delete lpBuffer; 
	delete pString; 
 
//	if(CheckDisplayIsSuppor(dwVideoFormat) == FALSE) 
//		return FALSE; 
//	if(dwVideoFormat != VIDEO_MODE_RGB) 
		dwError = CreateYUVSurface(m_hWnd,dwWidth,dwHeight); 
	if(dwError) 
		dwError = CreateRGBSurface(m_hWnd,dwWidth,dwHeight); 
//	m_PixelFormat = dwVideoFormat; 
	if(dwError != 0) 
	{ 
		char szStr[64]; 
		char szBuf[12]; 
		switch(m_PixelFormat) { 
		case VOUT_MODE_RGB555: 
		case VOUT_MODE_RGB565: 
		case VOUT_MODE_RGB24: 
		case VOUT_MODE_RGB32: 
			strcpy(szBuf,"RGB");		break; 
		case VOUT_MODE_YUY2: 
			strcpy(szBuf,"YUY2");		break; 
		case VOUT_MODE_YV12: 
			strcpy(szBuf,"YV12");		break; 
		case VOUT_MODE_UYVY: 
			strcpy(szBuf,"UYVY");		break; 
		} 
		//DDERR_INVALIDPIXELFORMAT  
		sprintf(szStr,"CreateDirectDraw surface (%s) failed, Error = %8X\n",szBuf,dwError); 
		MessageBox(0,szStr,"Error",MB_ICONERROR); 
		return FALSE; 
	} 
	return TRUE; 
} 
 
DWORD CDirectDisplay::CreateYUVSurface(HWND hWnd, DWORD dwWidth, DWORD dwHeight) 
{ 
	HRESULT			hr; 
	DDSURFACEDESC2	ddsd; 
 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
//	switch(m_PixelFormat) { 
//	case VOUT_MODE_YUY2: 
		ddsd.ddpfPixelFormat = g_ddpfYUY2Formats; 
		m_PixelFormat = VOUT_MODE_YUY2; 
//		break; 
//	case VOUT_MODE_UYVY: 
//		ddsd.ddpfPixelFormat = g_ddpfUYVYFormats; 
//		break; 
//	case VOUT_MODE_YV12: 
//		ddsd.ddpfPixelFormat = g_ddpfYU12Formats; 
//		break; 
//	default: 
//		return DDERR_UNSUPPORTEDMODE; 
//	} 
	ddsd.dwFlags        = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT|DDSD_PIXELFORMAT ; 
	ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_Memflag ; 
	ddsd.dwWidth        = dwWidth; 
	ddsd.dwHeight       = dwHeight; 
	if(FAILED(hr = m_pDD->CreateSurface( &ddsd, &m_pddsBackBuffer, NULL ) ) ) 
	{ 
		m_Memflag = DDSCAPS_SYSTEMMEMORY; 
		ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_Memflag; 
		if(FAILED(hr = m_pDD->CreateSurface( &ddsd, &m_pddsBackBuffer, NULL ) ) ) 
			return hr; 
	} 
	GetBackBuffer(); 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
	DWORD m_dwRgbCount = 0; 
	m_pDD->GetDisplayMode(&ddsd); 
	m_dwRgbCount = ddsd.ddpfPixelFormat.dwRGBBitCount; 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
	ddsd.dwFlags        = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT | DDSD_PIXELFORMAT | DDSD_CKSRCBLT; 
	ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_Memflag ; 
	ddsd.dwWidth        = dwWidth; 
	ddsd.dwHeight       = dwHeight; 
	ddsd.ddckCKSrcBlt.dwColorSpaceLowValue = 0; 
	ddsd.ddckCKSrcBlt.dwColorSpaceHighValue = 0; 
	if(m_dwRgbCount == 16) 
		ddsd.ddpfPixelFormat = g_ddpfRGB16Formats; 
	else 
		ddsd.ddpfPixelFormat = g_ddpfOSDFormats; 
	hr = m_pDD->CreateSurface(&ddsd, &m_pddsOSDBuffer,NULL); 
	if(FAILED(hr)) 
	{ 
		SAFE_RELEASE( m_pddsBackBuffer ); 
		return hr; 
	} 
	SetClipperWnd(hWnd); 
	return 0; 
} 
 
 
 
DWORD CDirectDisplay::CreateRGBSurface(HWND hWnd, DWORD dwWidth, DWORD dwHeight) 
{ 
	HRESULT hr; 
	DDSURFACEDESC2 ddsd; 
	ZeroMemory(&ddsd, sizeof(ddsd)); 
	ddsd.dwSize = sizeof(ddsd); 
	ddsd.dwFlags        = DDSD_CAPS | DDSD_WIDTH | DDSD_HEIGHT; 
		 
	ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | m_Memflag ; 
		 
	ddsd.dwWidth = dwWidth ; 
	ddsd.dwHeight = dwHeight ; 
	if( FAILED( hr = m_pDD->CreateSurface( &ddsd, &m_pddsBackBuffer, NULL ) ) ) 
	{ 
		if(hr) 
		{ 
			ddsd.ddsCaps.dwCaps = DDSCAPS_OFFSCREENPLAIN | DDSCAPS_SYSTEMMEMORY; 
			if( FAILED( hr = m_pDD->CreateSurface( &ddsd, &m_pddsBackBuffer, NULL ) ) ) 
				return hr; 
		} 
		else 
			return hr; 
	} 
	 
	ZeroMemory(&ddsd, sizeof(DDSURFACEDESC2)); 
	ddsd.dwSize     = sizeof(DDSURFACEDESC2); 
	hr = m_pddsBackBuffer->Lock(NULL, &ddsd,  
		DDLOCK_SURFACEMEMORYPTR | DDLOCK_WAIT,  
		NULL); 
	if(FAILED(hr)) 
	{	 
		SAFE_RELEASE( m_pddsBackBuffer ); 
		return hr; 
	} 
 
	if (ddsd.ddpfPixelFormat.dwFlags & DDPF_RGB) 
	{ 
		if(ddsd.ddpfPixelFormat.dwRGBBitCount==32) 
			m_PixelFormat=VOUT_MODE_RGB32; 
		else if(ddsd.ddpfPixelFormat.dwRGBBitCount==24) 
			m_PixelFormat=VOUT_MODE_RGB24; 
		else if(ddsd.ddpfPixelFormat.dwRGBBitCount==16) 
		{ 
			if(ddsd.ddpfPixelFormat.dwRBitMask==0xf800) 
				m_PixelFormat=VOUT_MODE_RGB565; 
			else if(ddsd.ddpfPixelFormat.dwRBitMask==0x7C00) // RGB555 
				m_PixelFormat=VOUT_MODE_RGB555; 
			else 
				m_PixelFormat = -1; 
		} 
		else 
		{ 
			char csTmp[128]; 
			sprintf(csTmp,"dwRGBBitCount = %d dwRGBBitCount = %04X ddsd.lPitch = %d\n", 
				ddsd.ddpfPixelFormat.dwRGBBitCount, 
				ddsd.ddpfPixelFormat.dwRBitMask, 
				ddsd.lPitch); 
			OutputDebugString(csTmp); 
			m_pddsBackBuffer->Unlock(NULL); 
			SAFE_RELEASE( m_pddsBackBuffer ); 
			return -4; 
		} 
	} 
	else 
	{ 
		m_pddsBackBuffer->Unlock(NULL); 
		SAFE_RELEASE( m_pddsBackBuffer ); 
		return -3; 
	} 
	m_pddsBackBuffer->Unlock(NULL); 
	 
	if(m_PixelFormat == -1) 
	{ 
		SAFE_RELEASE( m_pddsBackBuffer ); 
		return -2; 
	} 
	 
	if(SetClipperWnd(hWnd)) 
		return 0; 
	return -1; 
} 
 
BOOL CDirectDisplay::SetClipperWnd(HWND hWnd) 
{ 
	if((m_pddsFrontBuffer==NULL) || (m_pDD==NULL)) 
		return FALSE; 
	HRESULT hr; 
	LPDIRECTDRAWCLIPPER pcClipper; 
	m_hWnd = hWnd; 
	if(FAILED(hr=m_pDD->CreateClipper( 0, &pcClipper, NULL ))) 
		return FALSE; 
	if(FAILED(hr=pcClipper->SetHWnd( 0,hWnd))) 
		return FALSE; 
	if(FAILED(hr= m_pddsFrontBuffer->SetClipper( pcClipper ))) 
		return FALSE; 
	pcClipper->Release(); 
	UpdateBounds(); 
	return TRUE; 
} 
 
void CDirectDisplay::UpdateBounds() 
{ 
	GetClientRect( m_hWnd, &m_rcWindow ); 
	ClientToScreen( m_hWnd, (POINT*)&m_rcWindow.left); 
	ClientToScreen( m_hWnd, (POINT*)&m_rcWindow.right); 
} 
 
BOOL CDirectDisplay::UpdataImage(RECT *Rectdes) 
{ 
	if( (m_pddsFrontBuffer==NULL) || (m_pddsBackBuffer==NULL)) 
	{ 
		return FALSE; 
	} 
	if (m_pddsFrontBuffer->Blt(&m_rcWindow,m_pddsBackBuffer,Rectdes,DDBLT_WAIT,NULL) == DDERR_SURFACELOST) 
	{ 
		m_pddsFrontBuffer->Restore(); 
		m_pddsBackBuffer->Restore(); 
	} 
	return TRUE; 
} 
 
BOOL CDirectDisplay::DrawImageFromYUV(PBYTE in_Y, PBYTE in_U, PBYTE in_V,int nStride,RECT *Rectdes) 
{ 
	DWORD	i; 
	if(NULL == Rectdes) 
	{ 
		if(m_bDrawRect) 
			Rectdes = &m_DrawRect; 
	} 
	if((m_pddsFrontBuffer==NULL) || (m_pddsBackBuffer==NULL)) 
		return FALSE; 
	if(IsWindow(m_hWnd) == FALSE) 
		return TRUE; 
	if(IsWindowVisible(m_hWnd) == FALSE) 
		return TRUE; 
	if(lpSurface == 0) 
	{ 
		if(GetBackBuffer() == 0) 
			return TRUE; 
	} 
	try{ 
		switch(m_PixelFormat) 
		{ 
		case VOUT_MODE_RGB32: 
			yuv2rgb_32(in_Y,nStride,in_U,in_V,nStride>>1, 
				lpSurface, 
				m_nWidth,m_nHeight,m_lPitch); 
			break; 
		case VOUT_MODE_RGB24: 
			yuv2rgb_24(in_Y,nStride,in_U,in_V,nStride>>1, 
				lpSurface, 
				m_nWidth,m_nHeight,m_lPitch); 
			break; 
		case VOUT_MODE_RGB555: 
			yuv2rgb_555(in_Y,nStride,in_U,in_V,nStride>>1, 
				lpSurface, 
				m_nWidth,m_nHeight,m_lPitch); 
			break; 
		case VOUT_MODE_RGB565: 
			yuv2rgb_565(in_Y,nStride,in_U,in_V,nStride >> 1, 
				lpSurface, 
				m_nWidth,m_nHeight,m_lPitch); 
			break; 
		case VOUT_MODE_YUY2: 
			yuv2ToYUYV(in_Y,nStride,in_U,in_V,nStride>>1, 
				lpSurface, 
				m_nWidth,m_nHeight,m_lPitch); 
			break; 
		case VOUT_MODE_YV12: 
			if(nStride == (int)m_lPitch) 
			{ 
				memcpy(lpSurface,in_Y,nStride * m_nHeight); 
				memcpy(lpSurface + nStride * m_nHeight, 
					in_V,nStride * m_nHeight / 4); 
				memcpy(lpSurface + nStride * m_nHeight * 5 / 4, 
					in_U,nStride * m_nHeight / 4); 
			} 
			else 
			{ 
				for(i=0;i<m_nHeight;i++) 
					memcpy(lpSurface+i*m_lPitch,in_Y+i*nStride,m_nWidth); 
				for(i=0;i<m_nHeight/2;i++) 
					memcpy(lpSurface+m_nHeight*m_lPitch+i*m_lPitch/2,in_V+i*nStride/2,m_nWidth/2); 
				for(i=0;i<m_nHeight/2;i++) 
					memcpy(lpSurface+m_nHeight*m_lPitch+m_nHeight*m_lPitch/4+i*m_lPitch/2,in_U+i*nStride/2,m_nWidth/2); 
			} 
			break; 
		case VOUT_MODE_UYVY: 
			for(i = 0; i<m_nHeight; i++) 
			{ 
				memcpy(lpSurface+m_lPitch*i, in_Y+i*m_nWidth*2, m_nWidth*2); 
			} 
		break; 
		} 
	} 
	catch(...) 
	{ 
//		ASSERT(FALSE); 
		lpSurface = 0; 
		return TRUE; 
	} 
	UpdateImage(Rectdes); 
	return TRUE; 
} 
 
 
BOOL CDirectDisplay::SetDrawcallback(DrawCallback pOSD, LPVOID dwUser) 
{ 
	m_pDrawOSD = pOSD; 
	m_dwUser = dwUser; 
	return TRUE; 
} 
 
BOOL CDirectDisplay::SetVideoCallback(VideoCallBack videocallback,DWORD dwUser) 
{ 
	m_videocallback = videocallback; 
	m_dwvideouser = dwUser; 
	return TRUE; 
} 
 
void CDirectDisplay::GetShowRect(LPRECT lpRect) 
{ 
	GetClientRect( m_hWnd, lpRect ); 
	ClientToScreen( m_hWnd, (POINT*)&lpRect->left); 
	ClientToScreen( m_hWnd, (POINT*)&lpRect->right); 
	if(lpRect->right > iScreenWidth) 
        lpRect->right = iScreenWidth; 
	if(lpRect->bottom > iScreenHeight) 
		lpRect->bottom = iScreenHeight; 
	if(lpRect->right < 0) 
        lpRect->right = 276; 
	if(lpRect->bottom < 0) 
        lpRect->bottom = 232; 
	if(lpRect->left < 0) 
        lpRect->left = 0; 
	if(lpRect->top < 0) 
        lpRect->top = 0; 
} 
 
BYTE* CDirectDisplay::GetBackBuffer() 
{ 
	HRESULT			hr; 
	DDSURFACEDESC2	ddsd; 
	if(lpSurface) 
		return lpSurface; 
	ZeroMemory(&ddsd, sizeof(DDSURFACEDESC2)); 
	ddsd.dwSize     = sizeof(DDSURFACEDESC2); 
	hr = m_pddsBackBuffer->IsLost(); 
	if(hr = DDERR_SURFACELOST) 
	{ 
		hr = m_pddsFrontBuffer->Restore(); 
		hr = m_pddsBackBuffer->Restore(); 
		if(DD_OK != hr) 
		{ 
			lpSurface = 0; 
			return 0; 
		} 
		m_pDD->RestoreAllSurfaces(); 
	} 
	m_pddsBackBuffer->Lock(NULL, &ddsd,DDLOCK_WAIT,NULL); 
	if(FAILED(hr)) 
		return 0; 
	lpSurface = (BYTE *)ddsd.lpSurface; 
	m_lPitch = ddsd.lPitch; 
	m_pddsBackBuffer->Unlock(NULL); 
	return lpSurface; 
} 
 
/*BOOL CDirectDisplay::CheckDisplayIsSuppor(DWORD dwVideoFormat) 
{ 
	HRESULT hr; 
	if(dwVideoFormat == VIDEO_MODE_RGB) 
		return TRUE; 
	DWORD dwSize = 0; 
	DWORD *lpBuffer; 
	hr = m_pDD->GetFourCCCodes(&dwSize,0); 
	if(FAILED(hr)) 
		return FALSE; 
	//if(dwSize == 0)时,屏幕被锁定,不能创建表面 
	lpBuffer = new DWORD[dwSize]; 
	hr = m_pDD->GetFourCCCodes(&dwSize,lpBuffer); 
	if(FAILED(hr)) 
		return FALSE; 
	BOOL bSuperYV12 = FALSE; 
	BOOL bSuperYUY2 = FALSE; 
	BOOL bSuperUYVY = FALSE; 
	for(DWORD i = 0; i < dwSize; i++) 
	{ 
		if(lpBuffer[i] == MAKEFOURCC('Y','V','1','2')) 
			bSuperYV12 = TRUE; 
		if(lpBuffer[i] == MAKEFOURCC('Y','U','Y','2')) 
			bSuperYUY2 = TRUE;  
		if(lpBuffer[i] == MAKEFOURCC('U','Y','V','Y')) 
			bSuperUYVY = TRUE; 
	} 
	delete lpBuffer; 
	if(dwVideoFormat == VOUT_MODE_YUY2 && bSuperYUY2) 
		return TRUE; 
	if(dwVideoFormat == VOUT_MODE_YV12 && bSuperYV12) 
		return TRUE; 
	if(dwVideoFormat == VOUT_MODE_UYVY && bSuperUYVY) 
		return TRUE; 
	return FALSE; 
}*/ 
 
DWORD CDirectDisplay::GetOutputVideoMode() 
{ 
	return m_PixelFormat; 
} 
 
void CDirectDisplay::UpdateImage(RECT* Rectdes) 
{ 
	HRESULT ddrval; 
	if(m_videocallback) 
		m_videocallback(m_nWidth,m_nHeight,lpSurface,m_lPitch,m_dwvideouser); 
	GetShowRect(&m_rcWindow); 
	if(m_pDrawOSD) 
	{ 
		HDC hDC; 
		if(m_pddsOSDBuffer) 
		{ 
			RECT tmprect; 
			tmprect.left = 0; 
			tmprect.top = 0; 
			tmprect.right = m_nWidth; 
			tmprect.bottom = m_nHeight; 
			ddrval = m_pddsOSDBuffer->Blt(&tmprect,m_pddsBackBuffer,Rectdes,DDBLT_WAIT,NULL); 
			ddrval = m_pddsOSDBuffer->GetDC(&hDC); 
			if(ddrval == 0 && hDC) 
			{ 
				SIZE size; 
				size.cx = m_nWidth; 
				size.cy = m_nHeight; 
				m_pDrawOSD(hDC,&size,m_dwUser); 
				ddrval = m_pddsOSDBuffer->ReleaseDC(hDC); 
			} 
			ddrval = m_pddsFrontBuffer->Blt(&m_rcWindow,m_pddsOSDBuffer,Rectdes,DDBLT_WAIT,NULL); 
		} 
		else 
		{ 
			ddrval = m_pddsBackBuffer->GetDC(&hDC); 
			if(ddrval == 0 && hDC) 
			{ 
				SIZE size; 
				size.cx = m_nWidth; 
				size.cy = m_nHeight; 
				m_pDrawOSD(hDC,&size,m_dwUser); 
				m_pddsBackBuffer->ReleaseDC(hDC); 
			} 
			ddrval = m_pddsFrontBuffer->Blt(&m_rcWindow,m_pddsBackBuffer,Rectdes,DDBLT_DONOTWAIT,NULL); 
		} 
	} 
	else 
		ddrval = m_pddsFrontBuffer->Blt(&m_rcWindow,m_pddsBackBuffer,Rectdes,DDBLT_WAIT,NULL); 
} 
 
 
void CDirectDisplay::yuv2ToYUYV( 
								BYTE* lpSrcY,  
								int SrcPitch, 
								BYTE* lpSrcU, 
								BYTE* lpSrcV,  
								int stride_uv,  
								BYTE* lpDst,  
								int SrcWidth,  
								int SrcHeight, 
								unsigned int _stride_out 
								) 
{ 
	int SrcStride = SrcPitch + SrcPitch - SrcWidth;	 
	int SrcStrideU = (SrcPitch - SrcWidth)>>1; 
	int DstStride =(_stride_out - SrcWidth)<<1; 
	 
 
/*  C Language 
	int iStride = m_nWidth; 
	DWORD iDstStride=_stride_out; 
	unsigned char *pDirectBuf; 
	DWORD i,j; 
	unsigned char *yy=lpSrcY, *uu= lpSrcU, *vv = lpSrcV; 
	pDirectBuf = (unsigned char*)lpDst; 
	for( i=0; i<m_nHeight; i+=2) 
	{ 
		for( j=0; j<m_nWidth; j+=2) 
		{ 
			* pDirectBuf++ = * yy++; 
			* pDirectBuf++ = * uu++; 
			* pDirectBuf++ = * yy++; 
			* pDirectBuf++ = * vv++; 
		} 
		yy+=(iStride-m_nWidth); 
		uu-= (m_nWidth>>1); 
		vv-= (m_nWidth>>1); 
		pDirectBuf+=(iDstStride-(m_nWidth<<1)); 
		 
		for( j=0; j<m_nWidth; j+=2) 
		{ 
			* pDirectBuf++ = * yy++; 
			* pDirectBuf++ = * uu++; 
			* pDirectBuf++ = * yy++; 
			* pDirectBuf++ = * vv++; 
		} 
		yy+=(iStride-m_nWidth); 
		uu+= ((iStride>>1)-(m_nWidth>>1)); 
		vv+= ((iStride>>1)-(m_nWidth>>1)); 
		pDirectBuf+=(iDstStride-(m_nWidth<<1)); 
	} 
 
	return ; 
*/ 
	__asm 
	{ 
			 
			push eax 
			push ebx 
			push ecx 
			push edx 
			push edi 
			push esi 
 
 
			mov edi , [lpDst] 
			mov esi , [lpSrcY] 
			mov eax , [lpSrcU] 
			mov ebx , [lpSrcV] 
			mov ecx , [SrcHeight] 
			mov edx , [SrcWidth] 
cyc:  
			movq mm2 , qword ptr [eax] //u 
			movq mm3 , qword ptr [ebx] //v 
			 
			movq mm0 , qword ptr [esi] //y1 
			movq mm1 , qword ptr [esi+8] //y2 
			 
			movq mm4 , mm2 
			punpcklbw mm2 , mm3 // uv1 
			punpckhbw mm4 , mm3 // uv2 
			 
			movq mm3 , mm0 
			movq mm5 , mm1 
			punpcklbw mm0 , mm2 // yuyv1 
			punpckhbw mm3 , mm2 // yuyv2 
			punpcklbw mm1 , mm4 // yuyv3 
			punpckhbw mm5 , mm4 // yuyv4 
			 
			movq qword ptr [edi] , mm0 
			movq qword ptr [edi+8] , mm3 
			movq qword ptr [edi+16] , mm1 
			movq qword ptr [edi+24] , mm5 
			 
			add esi , [SrcPitch] 
			add edi , [_stride_out] 
			 
			movq mm0 , qword ptr [esi] //y1 
			movq mm1 , qword ptr [esi+8] //y2 
			 
			movq mm3 , mm0 
			movq mm5 , mm1 
			punpcklbw mm0 , mm2 // yuyv1 
			punpcklbw mm1 , mm4 // yuyv3 
			punpckhbw mm3 , mm2 // yuyv2 
			punpckhbw mm5 , mm4 // yuyv4 
			 
			movq qword ptr [edi] , mm0 
			movq qword ptr [edi+8] , mm3 
			movq qword ptr [edi+16] , mm1 
			movq qword ptr [edi+24] , mm5 
			 
			sub esi , [SrcPitch] 
			sub edi , [_stride_out] 
			 
			add eax , 8 
			add ebx , 8 
			add esi , 16 
			add edi , 32 
			 
			sub edx,16			 
			ja cyc 
			 
			mov edx,[SrcWidth] 
			 
			add esi , [SrcStride] 
			add eax , [SrcStrideU] 
			add ebx , [SrcStrideU] 
			add edi , [DstStride] 
			 
			sub ecx,2 
			ja cyc 
 
			pop esi 
			pop edi  
			pop edx  
			pop ecx 
			pop ebx  
			pop eax 
					 
			emms 
 
			 
	} 
} 
 
// -------------------------------------------------------------------------------------- 
 
//--------------------------------------------------------------------------- 
/**** YUV -> RGB conversion, 32-bit output ****/ 
/* if height_y is negative then the output image will be inverted */ 
/*  
note: _stride_out parameter is ignored in yuv to rgb conversion  
it's assumed that stride_out = 4 * width_y for the 32 bit color bitmap 
*/ 
void CDirectDisplay::yuv2rgb_32(BYTE *puc_y, int stride_y,  
								BYTE *puc_u, BYTE *puc_v, int stride_uv,  
								BYTE *puc_out, int width_y, int height_y, 
								unsigned int _stride_out)  
{ 
	 
	//int y; 
	int horiz_count; 
	horiz_count = -(width_y >> 3); 
	 
	_asm  
	{ 
		push eax 
		push ebx 
		push ecx 
		push edx 
		push edi 
		push esi 
		 
		mov esi, height_y 
		 
		mov eax, puc_out 
		mov ebx, puc_y        
		mov ecx, puc_u        
		mov edx, puc_v 
loopbegin: 
		mov edi, horiz_count	 
horiz_loop1: 
		movd mm2, [ecx] 
		pxor mm7, mm7 
		 
		movd mm3, [edx] 
		punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
		 
		movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
		punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
		 
		movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
		 
		psubusb mm0, mmb_0x10    ; mm0 -= 16   //成组数据相减 
		 
		psubw mm2, mmw_0x0080    ; mm2 -= 128 
		pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 
		 
		psubw mm3, mmw_0x0080    ; mm3 -= 128 
		psllw mm1, 3             ; mm1 *= 8 
		 
		psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 
		psllw mm2, 3             ; mm2 *= 8 
		 
		pmulhw mm1, mmw_mult_Y   ; mm1 *= luma coeff  
		psllw mm0, 3             ; mm0 *= 8 
		 
		psllw mm3, 3             ; mm3 *= 8 
		movq mm5, mm3            ; mm5 = mm3 = v 
		 
		pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
		movq mm4, mm2            ; mm4 = mm2 = u 
		 
		pmulhw mm0, mmw_mult_Y   ; mm0 *= luma coeff  
		movq mm7, mm1            ; even luma part 
		 
		pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
		paddsw mm7, mm5          ; mm7 = luma + chroma    __r6__r4__r2__r0 
		 
		pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
		packuswb mm7, mm7        ; mm7 = r6r4r2r0r6r4r2r0 
		 
		pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
		paddsw mm5, mm0          ; mm5 = luma + chroma    __r7__r5__r3__r1 
		 
		packuswb mm5, mm5        ; mm6 = r7r5r3r1r7r5r3r1 
		paddsw mm2, mm3          ; mm2 = green chroma 
		 
		movq mm3, mm1            ; mm3 = __y6__y4__y2__y0 
		movq mm6, mm1            ; mm6 = __y6__y4__y2__y0 
		 
		paddsw mm3, mm4          ; mm3 = luma + chroma    __b6__b4__b2__b0 
		paddsw mm6, mm2          ; mm6 = luma + chroma    __g6__g4__g2__g0 
		 
		punpcklbw mm7, mm5       ; mm7 = r7r6r5r4r3r2r1r0 
		paddsw mm2, mm0          ; odd luma part plus chroma part    __g7__g5__g3__g1 
		 
		packuswb mm6, mm6        ; mm2 = g6g4g2g0g6g4g2g0 
		packuswb mm2, mm2        ; mm2 = g7g5g3g1g7g5g3g1 
		 
		packuswb mm3, mm3        ; mm3 = b6b4b2b0b6b4b2b0 
		paddsw mm4, mm0          ; odd luma part plus chroma part    __b7__b5__b3__b1 
		 
		packuswb mm4, mm4        ; mm4 = b7b5b3b1b7b5b3b1 
		punpcklbw mm6, mm2       ; mm6 = g7g6g5g4g3g2g1g0 
		 
		punpcklbw mm3, mm4       ; mm3 = b7b6b5b4b3b2b1b0 
		 
		/* 32-bit shuffle.... */ 
		pxor mm0, mm0            ; is this needed? 
		 
		movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
		punpcklbw mm1, mm0       ; mm1 = __g3__g2__g1__g0 
		 
		movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
		punpcklbw mm0, mm7       ; mm0 = r3b3r2b2r1b1r0b0 
		 
		movq mm2, mm0            ; mm2 = r3b3r2b2r1b1r0b0 
		 
		punpcklbw mm0, mm1       ; mm0 = __r1g1b1__r0g0b0 
		punpckhbw mm2, mm1       ; mm2 = __r3g3b3__r2g2b2 
		 
		/* 32-bit save... */ 
		movq  [eax], mm0         ; eax[0] = __r1g1b1__r0g0b0 
		movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
		 
		movq 8[eax], mm2         ; eax[8] = __r3g3b3__r2g2b2 
		 
		/* 32-bit shuffle.... */ 
		pxor mm0, mm0            ; is this needed? 
		 
		punpckhbw mm1, mm0       ; mm1 = __g7__g6__g5__g4 
		 
		movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
		punpckhbw mm0, mm7       ; mm0 = r7b7r6b6r5b5r4b4 
		 
		movq mm2, mm0            ; mm2 = r7b7r6b6r5b5r4b4 
		 
		punpcklbw mm0, mm1       ; mm0 = __r5g5b5__r4g4b4 
		punpckhbw mm2, mm1       ; mm2 = __r7g7b7__r6g6b6 
		 
		/* 32-bit save... */ 
		add ebx, 8               ; puc_y   += 8; 
		add ecx, 4               ; puc_u   += 4; 
		 
		movq 16[eax], mm0        ; eax[16] = __r5g5b5__r4g4b4 
		add edx, 4               ; puc_v   += 4; 
		 
		movq 24[eax], mm2        ; eax[24] = __r7g7b7__r6g6b6 
			 
			// 0 1 2 3 4 5 6 7 rgb save order 
			 
		add eax, 32              ; puc_out += 32 
			 
		inc edi 
		jne horiz_loop1			 
		 
		mov eax, puc_out 
		add eax, _stride_out; 
		mov puc_out,eax 
		mov ebx, puc_y 
		add ebx,stride_y; 
		mov puc_y,ebx 
			 
		mov ecx, puc_u        
		mov edx, puc_v 
		mov edi, horiz_count 
horiz_loop2: 
		movd mm2, [ecx] 
		pxor mm7, mm7 
			 
		movd mm3, [edx] 
		punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
		 
		movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
		punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
		 
		movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
		 
		psubusb mm0, mmb_0x10    ; mm0 -= 16   //成组数据相减 
		 
		psubw mm2, mmw_0x0080    ; mm2 -= 128 
		pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 
		 
		psubw mm3, mmw_0x0080    ; mm3 -= 128 
		psllw mm1, 3             ; mm1 *= 8 
		 
		psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 
		psllw mm2, 3             ; mm2 *= 8 
		 
		pmulhw mm1, mmw_mult_Y   ; mm1 *= luma coeff  
		psllw mm0, 3             ; mm0 *= 8 
		 
		psllw mm3, 3             ; mm3 *= 8 
		movq mm5, mm3            ; mm5 = mm3 = v 
		 
		pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
		movq mm4, mm2            ; mm4 = mm2 = u 
		 
		pmulhw mm0, mmw_mult_Y   ; mm0 *= luma coeff  
		movq mm7, mm1            ; even luma part 
		 
		pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
		paddsw mm7, mm5          ; mm7 = luma + chroma    __r6__r4__r2__r0 
		 
		pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
		packuswb mm7, mm7        ; mm7 = r6r4r2r0r6r4r2r0 
		 
		pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
		paddsw mm5, mm0          ; mm5 = luma + chroma    __r7__r5__r3__r1 
		 
		packuswb mm5, mm5        ; mm6 = r7r5r3r1r7r5r3r1 
		paddsw mm2, mm3          ; mm2 = green chroma 
		 
		movq mm3, mm1            ; mm3 = __y6__y4__y2__y0 
		movq mm6, mm1            ; mm6 = __y6__y4__y2__y0 
		 
		paddsw mm3, mm4          ; mm3 = luma + chroma    __b6__b4__b2__b0 
		paddsw mm6, mm2          ; mm6 = luma + chroma    __g6__g4__g2__g0 
		 
		punpcklbw mm7, mm5       ; mm7 = r7r6r5r4r3r2r1r0 
		paddsw mm2, mm0          ; odd luma part plus chroma part    __g7__g5__g3__g1 
		 
		packuswb mm6, mm6        ; mm2 = g6g4g2g0g6g4g2g0 
		packuswb mm2, mm2        ; mm2 = g7g5g3g1g7g5g3g1 
		 
		packuswb mm3, mm3        ; mm3 = b6b4b2b0b6b4b2b0 
		paddsw mm4, mm0          ; odd luma part plus chroma part    __b7__b5__b3__b1 
		 
		packuswb mm4, mm4        ; mm4 = b7b5b3b1b7b5b3b1 
		punpcklbw mm6, mm2       ; mm6 = g7g6g5g4g3g2g1g0 
		 
		punpcklbw mm3, mm4       ; mm3 = b7b6b5b4b3b2b1b0 
		 
		/* 32-bit shuffle.... */ 
		pxor mm0, mm0            ; is this needed? 
		 
		movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
		punpcklbw mm1, mm0       ; mm1 = __g3__g2__g1__g0 
		 
		movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
		punpcklbw mm0, mm7       ; mm0 = r3b3r2b2r1b1r0b0 
		 
		movq mm2, mm0            ; mm2 = r3b3r2b2r1b1r0b0 
		 
		punpcklbw mm0, mm1       ; mm0 = __r1g1b1__r0g0b0 
		punpckhbw mm2, mm1       ; mm2 = __r3g3b3__r2g2b2 
		 
		/* 32-bit save... */ 
		movq  [eax], mm0         ; eax[0] = __r1g1b1__r0g0b0 
		movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
		 
		movq 8[eax], mm2         ; eax[8] = __r3g3b3__r2g2b2 
		 
		/* 32-bit shuffle.... */ 
		pxor mm0, mm0            ; is this needed? 
		 
		punpckhbw mm1, mm0       ; mm1 = __g7__g6__g5__g4 
		 
		movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
		punpckhbw mm0, mm7       ; mm0 = r7b7r6b6r5b5r4b4 
		 
		movq mm2, mm0            ; mm2 = r7b7r6b6r5b5r4b4 
		 
		punpcklbw mm0, mm1       ; mm0 = __r5g5b5__r4g4b4 
		punpckhbw mm2, mm1       ; mm2 = __r7g7b7__r6g6b6 
		 
		/* 32-bit save... */ 
		add ebx, 8               ; puc_y   += 8; 
		add ecx, 4               ; puc_u   += 4; 
		 
		movq 16[eax], mm0        ; eax[16] = __r5g5b5__r4g4b4 
		add edx, 4               ; puc_v   += 4; 
		 
		movq 24[eax], mm2        ; eax[24] = __r7g7b7__r6g6b6 
				 
		// 0 1 2 3 4 5 6 7 rgb save order 
				 
		add eax, 32              ; puc_out += 32 
				 
		inc edi 
		jne horiz_loop2			 
			 
		mov eax,puc_out 
		add eax,_stride_out 
		mov puc_out,eax 
		mov ebx,puc_y 
		add ebx,stride_y 
		mov puc_y,ebx 
		mov ecx,puc_u 
		add ecx,stride_uv 
		mov puc_u,ecx 
		mov edx,puc_v 
		add edx,stride_uv 
		mov puc_v,edx 
 
		dec esi 
		dec esi 
		jne loopbegin 
		pop esi 
		pop edi  
		pop edx  
		pop ecx 
		pop ebx  
		pop eax 
		 
		emms 
	}	 
} 
//--------------------------------------------------------------------------- 
 
/**** YUV -> RGB conversion, 24-bit output ****/ 
void CDirectDisplay::yuv2rgb_24(BYTE *puc_y, int stride_y,  
								BYTE *puc_u, BYTE *puc_v, int stride_uv,  
								BYTE *puc_out, int width_y, int height_y, 
								unsigned int _stride_out)  
{ 
	 
	int y, horiz_count; 
	horiz_count = -(width_y >> 3); 
	 
	for (y=0; y<height_y; y+=2)  
	{ 
		_asm  
		{ 
			push eax 
				push ebx 
				push ecx 
				push edx 
				push edi 
				 
				mov eax, puc_out        
				mov ebx, puc_y        
				mov ecx, puc_u        
				mov edx, puc_v 
				mov edi, horiz_count 
horiz_loop1: 
			 
			movd mm2, [ecx] 
				pxor mm7, mm7 
				 
				movd mm3, [edx] 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				 
				psubusb mm0, mmb_0x10    ; mm0 -= 16 
				 
				psubw mm2, mmw_0x0080    ; mm2 -= 128 
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 
				 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm1, 3             ; mm1 *= 8 
				 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 
				psllw mm2, 3             ; mm2 *= 8 
				 
				pmulhw mm1, mmw_mult_Y   ; mm1 *= luma coeff  
				psllw mm0, 3             ; mm0 *= 8 
				 
				psllw mm3, 3             ; mm3 *= 8 
				movq mm5, mm3            ; mm5 = mm3 = v 
				 
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
				movq mm4, mm2            ; mm4 = mm2 = u 
				 
				pmulhw mm0, mmw_mult_Y   ; mm0 *= luma coeff  
				movq mm7, mm1            ; even luma part 
				 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
				paddsw mm7, mm5          ; mm7 = luma + chroma    __r6__r4__r2__r0 
				 
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
				packuswb mm7, mm7        ; mm7 = r6r4r2r0r6r4r2r0 
				 
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
				paddsw mm5, mm0          ; mm5 = luma + chroma    __r7__r5__r3__r1 
				 
				packuswb mm5, mm5        ; mm6 = r7r5r3r1r7r5r3r1 
				paddsw mm2, mm3          ; mm2 = green chroma 
				 
				movq mm3, mm1            ; mm3 = __y6__y4__y2__y0 
				movq mm6, mm1            ; mm6 = __y6__y4__y2__y0 
				 
				paddsw mm3, mm4          ; mm3 = luma + chroma    __b6__b4__b2__b0 
				paddsw mm6, mm2          ; mm6 = luma + chroma    __g6__g4__g2__g0 
				 
				punpcklbw mm7, mm5       ; mm7 = r7r6r5r4r3r2r1r0 
				paddsw mm2, mm0          ; odd luma part plus chroma part    __g7__g5__g3__g1 
				 
				packuswb mm6, mm6        ; mm2 = g6g4g2g0g6g4g2g0 
				packuswb mm2, mm2        ; mm2 = g7g5g3g1g7g5g3g1 
				 
				packuswb mm3, mm3        ; mm3 = b6b4b2b0b6b4b2b0 
				paddsw mm4, mm0          ; odd luma part plus chroma part    __b7__b5__b3__b1 
				 
				packuswb mm4, mm4        ; mm4 = b7b5b3b1b7b5b3b1 
				punpcklbw mm6, mm2       ; mm6 = g7g6g5g4g3g2g1g0 
				 
				punpcklbw mm3, mm4       ; mm3 = b7b6b5b4b3b2b1b0 
				 
				/* 32-bit shuffle.... */ 
				pxor mm0, mm0            ; is this needed? 
				 
				movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
				punpcklbw mm1, mm0       ; mm1 = __g3__g2__g1__g0 
				 
				movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
				punpcklbw mm0, mm7       ; mm0 = r3b3r2b2r1b1r0b0 
				 
				movq mm2, mm0            ; mm2 = r3b3r2b2r1b1r0b0 
				 
				punpcklbw mm0, mm1       ; mm0 = __r1g1b1__r0g0b0 
				punpckhbw mm2, mm1       ; mm2 = __r3g3b3__r2g2b2 
				 
				/* 24-bit shuffle and save... */ 
				movd   [eax], mm0        ; eax[0] = __r0g0b0 
				psrlq mm0, 32            ; mm0 = __r1g1b1 
				 
				movd  3[eax], mm0        ; eax[3] = __r1g1b1 
				 
				movd  6[eax], mm2        ; eax[6] = __r2g2b2 
				 
				 
				psrlq mm2, 32            ; mm2 = __r3g3b3 
				 
				movd  9[eax], mm2        ; eax[9] = __r3g3b3 
				 
				/* 32-bit shuffle.... */ 
				pxor mm0, mm0            ; is this needed? 
				 
				movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
				punpckhbw mm1, mm0       ; mm1 = __g7__g6__g5__g4 
				 
				movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
				punpckhbw mm0, mm7       ; mm0 = r7b7r6b6r5b5r4b4 
				 
				movq mm2, mm0            ; mm2 = r7b7r6b6r5b5r4b4 
				 
				punpcklbw mm0, mm1       ; mm0 = __r5g5b5__r4g4b4 
				punpckhbw mm2, mm1       ; mm2 = __r7g7b7__r6g6b6 
				 
				/* 24-bit shuffle and save... */ 
				movd 12[eax], mm0        ; eax[12] = __r4g4b4 
				psrlq mm0, 32            ; mm0 = __r5g5b5 
				 
				movd 15[eax], mm0        ; eax[15] = __r5g5b5 
				add ebx, 8               ; puc_y   += 8; 
				 
				movd 18[eax], mm2        ; eax[18] = __r6g6b6 
					psrlq mm2, 32            ; mm2 = __r7g7b7 
					 
					add ecx, 4               ; puc_u   += 4; 
				add edx, 4               ; puc_v   += 4; 
				 
				movd 21[eax], mm2        ; eax[21] = __r7g7b7 
					add eax, 24              ; puc_out += 24 
					 
					inc edi 
					jne horiz_loop1			 
					 
			mov eax, puc_out 
			add eax, _stride_out; 
			mov puc_out,eax 
			mov ebx, puc_y 
			add ebx,stride_y; 
			mov puc_y,ebx 
 
			mov ecx, puc_u        
			mov edx, puc_v 
			mov edi, horiz_count 
horiz_loop2: 
			 
			movd mm2, [ecx] 
				pxor mm7, mm7 
				 
				movd mm3, [edx] 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				 
				psubusb mm0, mmb_0x10    ; mm0 -= 16 
				 
				psubw mm2, mmw_0x0080    ; mm2 -= 128 
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 
				 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm1, 3             ; mm1 *= 8 
				 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 
				psllw mm2, 3             ; mm2 *= 8 
				 
				pmulhw mm1, mmw_mult_Y   ; mm1 *= luma coeff  
				psllw mm0, 3             ; mm0 *= 8 
				 
				psllw mm3, 3             ; mm3 *= 8 
				movq mm5, mm3            ; mm5 = mm3 = v 
				 
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
				movq mm4, mm2            ; mm4 = mm2 = u 
				 
				pmulhw mm0, mmw_mult_Y   ; mm0 *= luma coeff  
				movq mm7, mm1            ; even luma part 
				 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
				paddsw mm7, mm5          ; mm7 = luma + chroma    __r6__r4__r2__r0 
				 
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
				packuswb mm7, mm7        ; mm7 = r6r4r2r0r6r4r2r0 
				 
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
				paddsw mm5, mm0          ; mm5 = luma + chroma    __r7__r5__r3__r1 
				 
				packuswb mm5, mm5        ; mm6 = r7r5r3r1r7r5r3r1 
				paddsw mm2, mm3          ; mm2 = green chroma 
				 
				movq mm3, mm1            ; mm3 = __y6__y4__y2__y0 
				movq mm6, mm1            ; mm6 = __y6__y4__y2__y0 
				 
				paddsw mm3, mm4          ; mm3 = luma + chroma    __b6__b4__b2__b0 
				paddsw mm6, mm2          ; mm6 = luma + chroma    __g6__g4__g2__g0 
				 
				punpcklbw mm7, mm5       ; mm7 = r7r6r5r4r3r2r1r0 
				paddsw mm2, mm0          ; odd luma part plus chroma part    __g7__g5__g3__g1 
				 
				packuswb mm6, mm6        ; mm2 = g6g4g2g0g6g4g2g0 
				packuswb mm2, mm2        ; mm2 = g7g5g3g1g7g5g3g1 
				 
				packuswb mm3, mm3        ; mm3 = b6b4b2b0b6b4b2b0 
				paddsw mm4, mm0          ; odd luma part plus chroma part    __b7__b5__b3__b1 
				 
				packuswb mm4, mm4        ; mm4 = b7b5b3b1b7b5b3b1 
				punpcklbw mm6, mm2       ; mm6 = g7g6g5g4g3g2g1g0 
				 
				punpcklbw mm3, mm4       ; mm3 = b7b6b5b4b3b2b1b0 
				 
				/* 32-bit shuffle.... */ 
				pxor mm0, mm0            ; is this needed? 
				 
				movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
				punpcklbw mm1, mm0       ; mm1 = __g3__g2__g1__g0 
				 
				movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
				punpcklbw mm0, mm7       ; mm0 = r3b3r2b2r1b1r0b0 
				 
				movq mm2, mm0            ; mm2 = r3b3r2b2r1b1r0b0 
				 
				punpcklbw mm0, mm1       ; mm0 = __r1g1b1__r0g0b0 
				punpckhbw mm2, mm1       ; mm2 = __r3g3b3__r2g2b2 
				 
				/* 24-bit shuffle and save... */ 
				movd   [eax], mm0        ; eax[0] = __r0g0b0 
				psrlq mm0, 32            ; mm0 = __r1g1b1 
				 
				movd  3[eax], mm0        ; eax[3] = __r1g1b1 
				 
				movd  6[eax], mm2        ; eax[6] = __r2g2b2 
				 
				 
				psrlq mm2, 32            ; mm2 = __r3g3b3 
				 
				movd  9[eax], mm2        ; eax[9] = __r3g3b3 
				 
				/* 32-bit shuffle.... */ 
				pxor mm0, mm0            ; is this needed? 
				 
				movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0 
				punpckhbw mm1, mm0       ; mm1 = __g7__g6__g5__g4 
				 
				movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0 
				punpckhbw mm0, mm7       ; mm0 = r7b7r6b6r5b5r4b4 
				 
				movq mm2, mm0            ; mm2 = r7b7r6b6r5b5r4b4 
				 
				punpcklbw mm0, mm1       ; mm0 = __r5g5b5__r4g4b4 
				punpckhbw mm2, mm1       ; mm2 = __r7g7b7__r6g6b6 
				 
				/* 24-bit shuffle and save... */ 
				movd 12[eax], mm0        ; eax[12] = __r4g4b4 
				psrlq mm0, 32            ; mm0 = __r5g5b5 
				 
				movd 15[eax], mm0        ; eax[15] = __r5g5b5 
				add ebx, 8               ; puc_y   += 8; 
				 
				movd 18[eax], mm2        ; eax[18] = __r6g6b6 
					psrlq mm2, 32            ; mm2 = __r7g7b7 
					 
					add ecx, 4               ; puc_u   += 4; 
				add edx, 4               ; puc_v   += 4; 
				 
				movd 21[eax], mm2        ; eax[21] = __r7g7b7 
					add eax, 24              ; puc_out += 24 
					 
					inc edi 
					jne horiz_loop2			 
					 
					pop edi  
					pop edx  
					pop ecx 
					pop ebx  
					pop eax 
					 
					emms 
					 
		}  
		puc_out += _stride_out;  
        puc_y   += stride_y; 
		puc_u   += stride_uv; 
		puc_v   += stride_uv; 
	} 
} 
//--------------------------------------------------------------------------- 
/**** YUV -> RGB conversion, 16-bit output (two flavours) ****/ 
/* all stride values are in _bytes_ */ 
void CDirectDisplay::yuv2rgb_555(BYTE *puc_y, int stride_y,  
								 BYTE *puc_u, BYTE *puc_v, int stride_uv,  
								 BYTE *puc_out,	int width_y, int height_y, 
								 unsigned int _stride_out)  
{ 
    int y, horiz_count; 
	 
	horiz_count = -(width_y >> 3); 
	 
	for (y=0; y<height_y; y+=2)  
	{ 
		 
		_asm  
		{ 
			push eax 
				push ebx 
				push ecx 
				push edx 
				push edi 
				 
				mov eax, puc_out        
				mov ebx, puc_y        
				mov ecx, puc_u        
				mov edx, puc_v 
				mov edi, horiz_count 
				 
horiz_loop1: 
			 
			// load data 
			movd mm2, [ecx]					 ; mm2 = ________u3u2u1u0 
				movd mm3, [edx]					 ; mm3 = ________v3v2v1v0 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				 
				pxor mm7, mm7						 ; zero mm7 
				 
				// convert chroma part 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				psubw mm2, mmw_0x0080    ; mm2 -= 128 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm2, 3             ; mm2 *= 8 
				psllw mm3, 3             ; mm3 *= 8 
				movq mm4, mm2            ; mm4 = mm2 = u 
				movq mm5, mm3            ; mm5 = mm3 = v 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
				paddsw mm2, mm3					 ; mm2 = green chroma 
				 
				// convert luma part 
				psubusb mm0, mmb_0x10    ; mm0 -= 16 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 luma even 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 luma odd 
				psllw mm0, 3             ; mm0 *= 8 
				psllw mm1, 3             ; mm1 *= 8 
				pmulhw mm0, mmw_mult_Y   ; mm0 luma odd *= luma coeff  
				pmulhw mm1, mmw_mult_Y   ; mm1 luma even *= luma coeff  
				 
				// complete the matrix calc with the addictions 
				movq mm3, mm4						 ; copy blue chroma 
				movq mm6, mm5						 ; copy red chroma 
				movq mm7, mm2						 ; copy green chroma 
				paddsw mm3, mm0					 ; mm3 = luma odd + blue chroma 
				paddsw mm4, mm1					 ; mm4 = luma even + blue chroma 
				paddsw mm6, mm0					 ; mm6 = luma odd + red chroma 
				paddsw mm5, mm1					 ; mm5 = luma even + red chroma 
				paddsw mm7, mm0					 ; mm7 = luma odd + green chroma 
				paddsw mm2, mm1					 ; mm2 = luma even + green chroma 
				// clipping 
				packuswb mm3, mm3 
				packuswb mm4, mm4 
				packuswb mm6, mm6 
				packuswb mm5, mm5 
				packuswb mm7, mm7 
				packuswb mm2, mm2 
				// interleave odd and even parts 
				punpcklbw mm4, mm3			 ; mm4 = b7b6b5b4b3b2b1b0 blue 
				punpcklbw mm5, mm6			 ; mm5 = r7r6r5r4r3r2r1r0 red 
				punpcklbw mm2, mm7			 ; mm2 = g7g6g5g4g3g2g1g0 green 
				 
				// mask not needed bits (using 555) 
				pand mm4, mask_5 
				pand mm5, mask_5 
				pand mm2, mask_5 
				 
				// mix colors and write 
				 
				psrlw mm4, 3						 ; mm4 = blue shifted 
				pand mm4, mask_blue			 ; mask the blue again 
				pxor mm7, mm7						 ; zero mm7 
				movq mm1, mm4						 ; mm1 = copy blue 
				movq mm3, mm5						 ; mm3 = copy red 
				movq mm6, mm2						 ; mm6 = copy green 
				 
				punpckhbw mm1, mm7 
				punpckhbw mm3, mm7 
				punpckhbw mm6, mm7 
				psllw mm6, 2						 ; shift green 
				psllw mm3, 7						 ; shift red 
				por mm6, mm3 
				por mm6, mm1 
				movq 8[eax], mm6 
				 
				punpcklbw mm2, mm7			 ; mm2 = __g3__g2__g1__g0 already masked 
				punpcklbw mm5, mm7 
				punpcklbw mm4, mm7 
				psllw mm2, 2						 ; shift green 
				psllw mm5, 7						 ; shift red 
				por mm2, mm5 
				por mm2, mm4 
				movq [eax], mm2 
				 
				add ebx, 8               ; puc_y   += 8; 
			add ecx, 4               ; puc_u   += 4; 
			add edx, 4               ; puc_v   += 4; 
			add eax, 16              ; puc_out += 16 // wrote 16 bytes 
				 
				inc edi 
				jne horiz_loop1			 
			mov eax, puc_out 
			add eax, _stride_out; 
			mov puc_out,eax 
			mov ebx, puc_y 
			add ebx,stride_y; 
			mov puc_y,ebx 
				mov ecx, puc_u        
				mov edx, puc_v 
				mov edi, horiz_count 
				 
horiz_loop2: 
			 
			// load data 
			movd mm2, [ecx]					 ; mm2 = ________u3u2u1u0 
				movd mm3, [edx]					 ; mm3 = ________v3v2v1v0 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				 
				pxor mm7, mm7						 ; zero mm7 
				 
				// convert chroma part 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				psubw mm2, mmw_0x0080    ; mm2 -= 128 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm2, 3             ; mm2 *= 8 
				psllw mm3, 3             ; mm3 *= 8 
				movq mm4, mm2            ; mm4 = mm2 = u 
				movq mm5, mm3            ; mm5 = mm3 = v 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff   
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma 
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma 
				paddsw mm2, mm3					 ; mm2 = green chroma 
				 
				// convert luma part 
				psubusb mm0, mmb_0x10    ; mm0 -= 16 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 luma even 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 luma odd 
				psllw mm0, 3             ; mm0 *= 8 
				psllw mm1, 3             ; mm1 *= 8 
				pmulhw mm0, mmw_mult_Y   ; mm0 luma odd *= luma coeff  
				pmulhw mm1, mmw_mult_Y   ; mm1 luma even *= luma coeff  
				 
				// complete the matrix calc with the addictions 
				movq mm3, mm4						 ; copy blue chroma 
				movq mm6, mm5						 ; copy red chroma 
				movq mm7, mm2						 ; copy green chroma 
				paddsw mm3, mm0					 ; mm3 = luma odd + blue chroma 
				paddsw mm4, mm1					 ; mm4 = luma even + blue chroma 
				paddsw mm6, mm0					 ; mm6 = luma odd + red chroma 
				paddsw mm5, mm1					 ; mm5 = luma even + red chroma 
				paddsw mm7, mm0					 ; mm7 = luma odd + green chroma 
				paddsw mm2, mm1					 ; mm2 = luma even + green chroma 
				// clipping 
				packuswb mm3, mm3 
				packuswb mm4, mm4 
				packuswb mm6, mm6 
				packuswb mm5, mm5 
				packuswb mm7, mm7 
				packuswb mm2, mm2 
				// interleave odd and even parts 
				punpcklbw mm4, mm3			 ; mm4 = b7b6b5b4b3b2b1b0 blue 
				punpcklbw mm5, mm6			 ; mm5 = r7r6r5r4r3r2r1r0 red 
				punpcklbw mm2, mm7			 ; mm2 = g7g6g5g4g3g2g1g0 green 
				 
				// mask not needed bits (using 555) 
				pand mm4, mask_5 
				pand mm5, mask_5 
				pand mm2, mask_5 
				 
				// mix colors and write 
				 
				psrlw mm4, 3						 ; mm4 = blue shifted 
				pand mm4, mask_blue			 ; mask the blue again 
				pxor mm7, mm7						 ; zero mm7 
				movq mm1, mm4						 ; mm1 = copy blue 
				movq mm3, mm5						 ; mm3 = copy red 
				movq mm6, mm2						 ; mm6 = copy green 
				 
				punpckhbw mm1, mm7 
				punpckhbw mm3, mm7 
				punpckhbw mm6, mm7 
				psllw mm6, 2						 ; shift green 
				psllw mm3, 7						 ; shift red 
				por mm6, mm3 
				por mm6, mm1 
				movq 8[eax], mm6 
				 
				punpcklbw mm2, mm7			 ; mm2 = __g3__g2__g1__g0 already masked 
				punpcklbw mm5, mm7 
				punpcklbw mm4, mm7 
				psllw mm2, 2						 ; shift green 
				psllw mm5, 7						 ; shift red 
				por mm2, mm5 
				por mm2, mm4 
				movq [eax], mm2 
				 
				add ebx, 8               ; puc_y   += 8; 
			add ecx, 4               ; puc_u   += 4; 
			add edx, 4               ; puc_v   += 4; 
			add eax, 16              ; puc_out += 16 // wrote 16 bytes 
				 
				inc edi 
				jne horiz_loop2			 
				 
				pop edi  
				pop edx  
				pop ecx 
				pop ebx  
				pop eax 
				 
				emms 
				 
		} 
		puc_out += _stride_out; 
		puc_y   += stride_y; 
		puc_u   += stride_uv; 
		puc_v   += stride_uv; 
		 
	} 
} 
 
//--------------------------------------------------------------------------- 
 
void CDirectDisplay::yuv2rgb_565(BYTE *puc_y, int stride_y,  
								 BYTE *puc_u, BYTE *puc_v, int stride_uv,  
								 BYTE *puc_out, int width_y, int height_y, 
								 unsigned int _stride_out)  
{ 
     
	int y, horiz_count; 
	unsigned short * pus_out; 
	pus_out = (unsigned short *) puc_out; 
	horiz_count = -(width_y >> 3); 
	 
	for (y=0; y<height_y; y+=2)  
	{ 
		_asm  
		{ 
			push eax 
				push ebx 
				push ecx 
				push edx 
				push edi 
				 
				mov eax, puc_out        
				mov ebx, puc_y        
				mov ecx, puc_u        
				mov edx, puc_v 
				mov edi, horiz_count 
				 
horiz_loop1: 
			 
			// load data 
			movd mm2, [ecx]					 ; mm2 = ________u3u2u1u0 
				movd mm3, [edx]					 ; mm3 = ________v3v2v1v0 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				 
				pxor mm7, mm7			 ; zero mm7 
				 
				// convert chroma part 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				psubw mm2, mmw_0x0080    ; mm2 -= 128  //0x0080008000800080 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm2, 3             ; mm2 *= 8    // u 
				psllw mm3, 3             ; mm3 *= 8    // v 
				movq mm4, mm2            ; mm4 = mm2 = u 
				movq mm5, mm3            ; mm5 = mm3 = v 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  //mmw_mult_U_G=0xf36ef36ef36ef36e; 
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff  //mmw_mult_V_G=0xe5e2e5e2e5e2e5e2; 
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma     //mmw_mult_U_B=0x40cf40cf40cf40cf;   
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma      //mmw_mult_V_R=0x3343334333433343; 
				paddsw mm2, mm3			 ; mm2 = green chroma    // u+v 
				 
				// convert luma part 
				psubusb mm0, mmb_0x10    ; mm0 -= 16     ;y-16        //mmb_0x10=0x1010101010101010; 
				 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 luma even 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 luma odd 
				 
				psllw mm0, 3             ; mm0 *= 8 
				psllw mm1, 3             ; mm1 *= 8 
				pmulhw mm0, mmw_mult_Y   ; mm0 luma odd *= luma coeff   //mmw_mult_Y= 0x2568256825682568; 
				pmulhw mm1, mmw_mult_Y   ; mm1 luma even *= luma coeff  // 
				 
				// complete the matrix calc with the addictions 
				movq mm3, mm4						 ; copy blue chroma 
				movq mm6, mm5						 ; copy red chroma 
				movq mm7, mm2						 ; copy green chroma 
				paddsw mm3, mm0					 ; mm3 = luma odd + blue chroma 
				paddsw mm4, mm1					 ; mm4 = luma even + blue chroma 
				paddsw mm6, mm0					 ; mm6 = luma odd + red chroma 
				paddsw mm5, mm1					 ; mm5 = luma even + red chroma 
				paddsw mm7, mm0					 ; mm7 = luma odd + green chroma 
				paddsw mm2, mm1					 ; mm2 = luma even + green chroma 
				// clipping 
				packuswb mm3, mm3 
				packuswb mm4, mm4 
				packuswb mm6, mm6 
				packuswb mm5, mm5 
				packuswb mm7, mm7 
				packuswb mm2, mm2 
				// interleave odd and even parts 
				punpcklbw mm4, mm3			 ; mm4 = b7b6b5b4b3b2b1b0 blue 
				punpcklbw mm5, mm6			 ; mm5 = r7r6r5r4r3r2r1r0 red 
				punpcklbw mm2, mm7			 ; mm2 = g7g6g5g4g3g2g1g0 green 
				 
				// mask not needed bits (using 555) 
				pand mm4, mask_5 
				pand mm5, mask_5 
				pand mm2, mask_5 
				 
				// mix colors and write 
				 
				psrlw mm4, 3						 ; mm4 = red shifted 
				pand mm4, mask_blue			 ; mask the blue again 
				pxor mm7, mm7						 ; zero mm7 
				movq mm1, mm5						 ; mm1 = copy blue 
				movq mm3, mm4						 ; mm3 = copy red 
				movq mm6, mm2						 ; mm6 = copy green 
				 
				punpckhbw mm1, mm7 
				punpckhbw mm3, mm7 
				punpckhbw mm6, mm7 
				psllw mm6, 3						 ; shift green 
				psllw mm1, 8						 ; shift blue 
				por mm6, mm3 
				por mm6, mm1 
				movq 8[eax], mm6 
				 
				punpcklbw mm2, mm7			 ; mm2 = __g3__g2__g1__g0 already masked 
				punpcklbw mm4, mm7 
				punpcklbw mm5, mm7 
				psllw mm2, 3						 ; shift green 
				psllw mm5, 8						 ; shift blue 
				por mm2, mm4 
				por mm2, mm5 
				movq [eax], mm2 
				 
				add ebx, 8               ; puc_y   += 8; 
			add ecx, 4               ; puc_u   += 4; 
			add edx, 4               ; puc_v   += 4; 
			add eax, 16              ; puc_out += 16 // wrote 16 bytes 
				 
				inc edi 
				jne horiz_loop1			 
				 
			mov eax, puc_out 
			add eax, _stride_out; 
			mov puc_out,eax 
			mov ebx, puc_y 
			add ebx,stride_y; 
			mov puc_y,ebx 
				 
				mov ecx, puc_u        
				mov edx, puc_v 
				mov edi, horiz_count 
				 
horiz_loop2: 
			 
			// load data 
			movd mm2, [ecx]					 ; mm2 = ________u3u2u1u0 
				movd mm3, [edx]					 ; mm3 = ________v3v2v1v0 
				movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0   
				 
				pxor mm7, mm7			 ; zero mm7 
				 
				// convert chroma part 
				punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0 
				punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0 
				psubw mm2, mmw_0x0080    ; mm2 -= 128  //0x0080008000800080 
				psubw mm3, mmw_0x0080    ; mm3 -= 128 
				psllw mm2, 3             ; mm2 *= 8    // u 
				psllw mm3, 3             ; mm3 *= 8    // v 
				movq mm4, mm2            ; mm4 = mm2 = u 
				movq mm5, mm3            ; mm5 = mm3 = v 
				pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff  //mmw_mult_U_G=0xf36ef36ef36ef36e; 
				pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff  //mmw_mult_V_G=0xe5e2e5e2e5e2e5e2; 
				pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma     //mmw_mult_U_B=0x40cf40cf40cf40cf;   
				pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma      //mmw_mult_V_R=0x3343334333433343; 
				paddsw mm2, mm3			 ; mm2 = green chroma    // u+v 
				 
				// convert luma part 
				psubusb mm0, mmb_0x10    ; mm0 -= 16     ;y-16        //mmb_0x10=0x1010101010101010; 
				 
				movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff  
				pand mm1, mm0            ; mm1 = __y6__y4__y2__y0 luma even 
				psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1 luma odd 
				 
				psllw mm0, 3             ; mm0 *= 8 
				psllw mm1, 3             ; mm1 *= 8 
				pmulhw mm0, mmw_mult_Y   ; mm0 luma odd *= luma coeff   //mmw_mult_Y= 0x2568256825682568; 
				pmulhw mm1, mmw_mult_Y   ; mm1 luma even *= luma coeff  // 
				 
				// complete the matrix calc with the addictions 
				movq mm3, mm4						 ; copy blue chroma 
				movq mm6, mm5						 ; copy red chroma 
				movq mm7, mm2						 ; copy green chroma 
				paddsw mm3, mm0					 ; mm3 = luma odd + blue chroma 
				paddsw mm4, mm1					 ; mm4 = luma even + blue chroma 
				paddsw mm6, mm0					 ; mm6 = luma odd + red chroma 
				paddsw mm5, mm1					 ; mm5 = luma even + red chroma 
				paddsw mm7, mm0					 ; mm7 = luma odd + green chroma 
				paddsw mm2, mm1					 ; mm2 = luma even + green chroma 
				// clipping 
				packuswb mm3, mm3 
				packuswb mm4, mm4 
				packuswb mm6, mm6 
				packuswb mm5, mm5 
				packuswb mm7, mm7 
				packuswb mm2, mm2 
				// interleave odd and even parts 
				punpcklbw mm4, mm3			 ; mm4 = b7b6b5b4b3b2b1b0 blue 
				punpcklbw mm5, mm6			 ; mm5 = r7r6r5r4r3r2r1r0 red 
				punpcklbw mm2, mm7			 ; mm2 = g7g6g5g4g3g2g1g0 green 
				 
				// mask not needed bits (using 555) 
				pand mm4, mask_5 
				pand mm5, mask_5 
				pand mm2, mask_5 
				 
				// mix colors and write 
				 
				psrlw mm4, 3						 ; mm4 = red shifted 
				pand mm4, mask_blue			 ; mask the blue again 
				pxor mm7, mm7						 ; zero mm7 
				movq mm1, mm5						 ; mm1 = copy blue 
				movq mm3, mm4						 ; mm3 = copy red 
				movq mm6, mm2						 ; mm6 = copy green 
				 
				punpckhbw mm1, mm7 
				punpckhbw mm3, mm7 
				punpckhbw mm6, mm7 
				psllw mm6, 3						 ; shift green 
				psllw mm1, 8						 ; shift blue 
				por mm6, mm3 
				por mm6, mm1 
				movq 8[eax], mm6 
				 
				punpcklbw mm2, mm7			 ; mm2 = __g3__g2__g1__g0 already masked 
				punpcklbw mm4, mm7 
				punpcklbw mm5, mm7 
				psllw mm2, 3						 ; shift green 
				psllw mm5, 8						 ; shift blue 
				por mm2, mm4 
				por mm2, mm5 
				movq [eax], mm2 
				 
				add ebx, 8               ; puc_y   += 8; 
			add ecx, 4               ; puc_u   += 4; 
			add edx, 4               ; puc_v   += 4; 
			add eax, 16              ; puc_out += 16 // wrote 16 bytes 
				 
				inc edi 
				jne horiz_loop2			 
				 
				pop edi  
				pop edx  
				pop ecx 
				pop ebx  
				pop eax 
				 
				emms 
				 
		} 
		puc_y   += stride_y; 
		puc_out += _stride_out; 
		puc_u   += stride_uv; 
		puc_v   += stride_uv; 
		 
	} 
	 
} 
 
BOOL CDirectDisplay::CaptureImage(int nImageType,int nQuality,unsigned char *pImageBuf, unsigned long *pnImageSize) 
{ 
	return FALSE; 
}