// www.pudn.com > mmxswarm.zip > SSE2Surface16.cpp


// SSE2Surface16.cpp : implementation of the CSSE2Surface16Intrinsic 
// class 
// 
// This is a part of the Microsoft Foundation Classes C++ library. 
// Copyright (c) Microsoft Corporation.  All rights reserved. 
// 
// This source code is only intended as a supplement to the 
// Microsoft Foundation Classes Reference and related 
// electronic documentation provided with the library. 
// See these sources for detailed information regarding the 
// Microsoft Foundation Classes product. 
// 
#include "stdafx.h" 
#include "SSE2Surface.h" 
#include "SSE2Wrapper.h" 
 
typedef CSSE2Unsigned16Saturated CSSE2; 
 
// Width adjustment for the 16-bit surface, which is processed
// 8 pixels (one 128-bit block) at a time.
//
// pWidth: in/out. On return *pWidth holds (*pWidth - m_kDeltaX)
//         rounded up to a multiple of 8.
void CSSE2Surface16Intrinsic::AdjustWidth(int *pWidth)
{
	ASSERT(pWidth != NULL);
	ASSERT(m_kDeltaX <= 7);

	// Lines are meant to start on a 128-bit / 16-byte boundary.
	// A pixel is 2 bytes wide, so that is a granularity of 8 pixels,
	// i.e. the low three bits of the width must end up clear.
	const int kPixelMask = 0x7;

	// Bias by (8 - 1) so that clearing the low bits rounds up, not down.
	const int biased = *pWidth + kPixelMask - m_kDeltaX;
	*pWidth = biased & ~kPixelMask;
}
 
void CSSE2Surface16Intrinsic::OnCreated() 
{ 
	ASSERT(GetBitDepth() == 16); 
	ASSERT((GetPitch() & 0xF) == 0); 
	ASSERT(GetVisibleWidth() && GetVisibleHeight()); 
 
	int width = GetVisibleWidth(); 
    m_qqwpl  = GetPitch()/16; // qwords Per Line 
    m_width = (width+7)/8; // 8 pixels at a time 
} 
 
// Blurs the visible surface in place, 8 pixels (one 128-bit block) per
// inner-loop iteration. Each 5-bit colour field of every pixel (bits 0-4,
// 5-9 and 10-14; bit 15 is covered by none of the masks) is replaced by a
// weighted average of the pixel and its four neighbours:
//     (up + down + left + right + 4*current) / 8
// Because results are stored back into the buffer being read, the left
// neighbour of each block after the first has already been blurred.
//
// Original author's performance note: this is still faster than the brute
// force approach, but slower than choosing 24 bit and blitting to a 16 bit
// screen; breaking the bits out into MMX friendly sizes doesn't work out
// well.
//
// NOTE(review): the up/down loads for the first and last rows, and the
// left/right loads at the ends of the buffer, address memory outside the
// visible area -- presumably the surface is allocated with a border;
// confirm against the base class.
// NOTE(review): pCur is only ever advanced by the inner loop's pCur++, so
// consecutive rows are assumed to be exactly m_width blocks apart --
// confirm that m_width always equals m_qqwpl.
void CSSE2Surface16Intrinsic::BlurBits() 
{ 
    int height = GetVisibleHeight(); 
    __m128i *pCur  = (__m128i *)GetPixelAddress(0,0); 
 
	// *Base hold the raw 8-pixel blocks; the un-suffixed variables hold
	// one colour field extracted from them at a time.
	CSSE2 cUpBase, cDownBase, cCurBase, cLeftBase, cRightBase; 
	CSSE2 cUp, cDown, cCur, cLeft, cRight; 
	CSSE2 cDest; 
	CSSE2 cMask; 
	// 0x001f per 16-bit pixel: selects the lowest 5-bit colour field.
	// (Presumably Fill replicates the pattern across the whole register.)
	cMask.Fill(0x001f001f001f001fu); 
 
	do { 
		int width = m_width; 
		do { 
			// Load the current 8-pixel block and its four neighbours.
			// Left/right are the same block shifted by one pixel (one
			// WORD), so they are read through pwCur with unaligned
			// loads - which is not normally recommended, but per the
			// original author is faster on x86 than loading aligned
			// and shift-oring. Up/down are one line (m_qqwpl blocks)
			// away and use aligned loads.
			WORD *pwCur = (WORD *)pCur; 
			cLeftBase.LoadU(pwCur-1); 
			cCurBase.Load(pCur); 
			cRightBase.LoadU(pwCur+1); 
			cUpBase.Load(pCur-m_qqwpl); 
			cDownBase.Load(pCur+m_qqwpl); 
 
			// First colour field (bits 0-4).
			cLeft = cLeftBase & cMask; 
			cCur = cCurBase & cMask; 
			cRight = cRightBase & cMask; 
			cUp = cUpBase & cMask; 
			cDown = cDownBase & cMask; 
 
			// Sum the 4 neighbours plus 4x the centre (cCur<<2), then
			// divide by the total weight of 8. Every input is at most
			// 31, so the result fits in 5 bits and needs no masking.
			cDest = ((cDown+cUp+cLeft+cRight+(cCur<<2))>>3); 
			cMask <<= 5; 
 
			// Second colour field (bits 5-9), processed in place.
			cLeft = cLeftBase & cMask; 
			cCur = cCurBase & cMask; 
			cRight = cRightBase & cMask; 
			cUp = cUpBase & cMask; 
			cDown = cDownBase & cMask; 
 
			// Same weighted average; the >>3 can leave bits below the
			// field, so the result is masked before being merged.
			cDest |= cMask & ((cDown+cUp+cLeft+cRight+(cCur<<2))>>3); 
			cMask <<= 5; 
 
			// Third colour field (bits 10-14). Here each operand is
			// divided by 8 *before* summing: 8 * 0x7c00 would not fit
			// in 16 bits. The pre-shift is exact because the low 10
			// bits of every operand are zero after masking.
			cLeft = (cLeftBase & cMask) >> 3; 
			cCur = (cCurBase & cMask) >> 3; 
			cRight = (cRightBase & cMask) >> 3; 
			cUp = (cUpBase & cMask) >> 3; 
			cDown = (cDownBase & cMask) >> 3; 
 
			// Weighted sum of the pre-divided operands, masked back to
			// the field, then write the finished block and advance.
			cDest |= cMask & (cDown+cUp+cLeft+cRight+(cCur<<2)); 
			cDest.Store(pCur++); 
			// Undo the two <<5 shifts so the mask selects bits 0-4
			// again for the next block.
			cMask >>= 10; 
		} while (--width > 0); 
	} while (--height > 0); 
}