// Source: www.pudn.com > Estereo.rar > processingSSE2.cpp


/***************************************************************************  
* 
* Copyright 2004 by the Massachusetts Institute  of Technology.   All    
* rights reserved.  
*   
* Developed  by the Vision Interface Group   
* at the Computer Sciences and Artificial Intelligence Laboratory,  
* MIT, Cambridge, Massachusetts.  
*   
* Permission to use, copy, or modify this software and  its documentation  
* for  educational  and  research purposes only and without fee  is hereby  
* granted, provided  that this copyright notice and the original authors's  
* names appear  on all copies and supporting documentation.  If individual  
* files are  separated from  this  distribution directory  structure, this  
* copyright notice must be included.  For any other uses of this software,  
* in original or  modified form, including but not limited to distribution  
* in whole or in  part, specific  prior permission  must be  obtained from  
* MIT.  These programs shall not  be  used, rewritten, or  adapted as  the  
* basis  of  a  commercial  software  or  hardware product  without  first  
* obtaining appropriate licenses  from MIT.  MIT. makes no representations  
* about the suitability of this  software for any purpose.  It is provided  
* "as is" without express or implied warranty.  
*   
**************************************************************************/ 
 
#include "stereoMatching.h" 
#include "processingmmx.h" 
 
// Dest[i] = saturate_u8( |Src1[i]-Src2[i]| + |Src1[i]-Src3[i]| ), 16 bytes per
// loop iteration, written as MSVC 32-bit inline assembly with SSE2.
//
// Src1, Src2 and Dest are accessed with movdqa and therefore must be 16-byte
// aligned; Src3 is read with movdqu and may be unaligned.
// l is the length in bytes. Only the floor(l/16) complete 16-byte chunks are
// processed; a trailing l%16 bytes of Dest are left untouched.
// Returns 0 (nothing written) when l < 16, 1 otherwise.
inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2,
				 const unsigned char *Src3, unsigned char *Dest, int l)
{

	// At least one full 16-byte chunk is required: with fewer bytes the chunk
	// counter below would start at 0 and the dec/jnz loop would wrap around
	// and run far past the end of every buffer.
	if (l < 16) return 0;

  __asm
  {
        mov eax, Src1
        mov ebx, Src2
		mov edx, Src3
        mov edi, Dest
        mov	ecx, l
        shr	ecx, 4				// ecx = number of 16-byte chunks (>= 1 here)

align 16
inner_loop:
		movdqa	xmm1,[eax]		// xmm1 = src1
		movdqa	xmm2,[ebx]		// xmm2 = src2

		movdqa	xmm4,xmm1		// xmm4 = src1

		psubusb	xmm4,xmm2		// xmm4 = sat(src1 - src2)

		movdqu	xmm3,[edx]		// xmm3 = src3 (unaligned load)
		psubusb	xmm2,xmm1		// xmm2 = sat(src2 - src1)

		movdqa	xmm5,xmm1		// xmm5 = src1
		por		xmm2,xmm4		// xmm2 = |src1-src2| (one of the two halves is 0)

        psubusb	xmm5,xmm3		// xmm5 = sat(src1 - src3)

        psubusb	xmm3,xmm1	 	// xmm3 = sat(src3 - src1)

		por		xmm3,xmm5		// xmm3 = |src1-src3|

		paddusb xmm2,xmm3		// xmm2 = sat(|src1-src2| + |src1-src3|)

        movdqa    [edi], xmm2
        add eax,16
        add ebx,16
        add edx,16
        add edi,16
        dec ecx
        jnz inner_loop
        emms					// no MMX register is used above; kept to preserve the original behavior
  }

  return 1;
}
 
 
 
 
 
 
 
// One accumulation step used inside the avg_Col_*_sse2 inline-assembly loops:
// adds the 8 words at [edx] into xmm3 and the 8 words at [edx+16] into xmm2
// (paddusw = unsigned saturating word add), then advances edx by edi.
// Expects edx, edi, xmm2 and xmm3 to have been set up by the surrounding __asm
// block. Keep comments out of the macro body itself: every line must end with
// the continuation backslash.
#define macro_add_sse2 __asm \
{						\
	__asm 	paddusw xmm3, [edx]	\
	__asm 	paddusw xmm2, [edx+16]	\
	__asm	add edx, edi		\
}
 
 
// Vertical 5-row box filter (MSVC 32-bit inline assembly, SSE2).
//
// For every output element the 16-bit values of the 5 rows centred on the
// current row of 'im' (2 rows above .. 2 rows below) are added with unsigned
// saturation, the sum is shifted right by 3 and packed to a byte with
// saturation to 0..255.
//
//   im       - first element of the centre row; elements are handled as
//              16-bit words, rows are 'width' elements apart
//   im_out   - receives one byte per processed element
//   dataSize - number of elements to produce; handled in chunks of 16, an
//              incomplete trailing chunk (or dataSize <= 0) is not processed
//   width    - row stride of 'im' in elements
//
// movdqa and the memory operands of paddusw need 16-byte aligned addresses, so
// 'im', 'im_out' and the row stride in bytes (2*width) must keep every access
// 16-byte aligned. The rows above and below the centre row must be readable.
// NOTE(review): the shift is 3 for every mask height, i.e. the result is
// sum/8, not a true mean over 5 rows - confirm the intended scaling.
inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1			// edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize	// eax = elements still to produce
	mov ecx, im_out		// ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi		// ebx = im - 2 rows (top row of the window)

	row_sum_loop:

		// Continue only while a complete 16-element chunk is left. The signed
		// compare also stops for dataSize <= 0 and for a partial tail, where
		// testing for exactly zero would never terminate.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx	// edx walks down the window rows of this chunk
		add ebx, 32		// next chunk: 16 words further along the top row

		// row 1 of 5 initialises the accumulators
		movdqa xmm3, [edx]		// xmm3 = words 0..7
		movdqa xmm2, [edx+16]	// xmm2 = words 8..15
		add edx, edi

		// rows 2..5
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16		// 16 elements done
		add ecx, 16		// advance output by 16 bytes

		jmp row_sum_loop

	end_sum_loop:
	emms				// no MMX register is used above; kept to preserve the original behavior
	}
}
 
// Vertical 7-row box filter (MSVC 32-bit inline assembly, SSE2).
//
// For every output element the 16-bit values of the 7 rows centred on the
// current row of 'im' (3 rows above .. 3 rows below) are added with unsigned
// saturation, the sum is shifted right by 3 and packed to a byte with
// saturation to 0..255.
//
//   im       - first element of the centre row; elements are handled as
//              16-bit words, rows are 'width' elements apart
//   im_out   - receives one byte per processed element
//   dataSize - number of elements to produce; handled in chunks of 16, an
//              incomplete trailing chunk (or dataSize <= 0) is not processed
//   width    - row stride of 'im' in elements
//
// movdqa and the memory operands of paddusw need 16-byte aligned addresses, so
// 'im', 'im_out' and the row stride in bytes (2*width) must keep every access
// 16-byte aligned. The rows above and below the centre row must be readable.
// NOTE(review): the shift is 3 for every mask height, i.e. the result is
// sum/8, not a true mean over 7 rows - confirm the intended scaling.
inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1			// edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize	// eax = elements still to produce
	mov ecx, im_out		// ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi		// ebx = im - 3 rows (top row of the window)

	row_sum_loop:

		// Continue only while a complete 16-element chunk is left. The signed
		// compare also stops for dataSize <= 0 and for a partial tail, where
		// testing for exactly zero would never terminate.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx	// edx walks down the window rows of this chunk
		add ebx, 32		// next chunk: 16 words further along the top row

		// row 1 of 7 initialises the accumulators
		movdqa xmm3, [edx]		// xmm3 = words 0..7
		movdqa xmm2, [edx+16]	// xmm2 = words 8..15
		add edx, edi

		// rows 2..7
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16		// 16 elements done
		add ecx, 16		// advance output by 16 bytes

		jmp row_sum_loop

	end_sum_loop:
	emms				// no MMX register is used above; kept to preserve the original behavior
	}
}
 
// Vertical 9-row box filter (MSVC 32-bit inline assembly, SSE2).
//
// For every output element the 16-bit values of the 9 rows centred on the
// current row of 'im' (4 rows above .. 4 rows below) are added with unsigned
// saturation, the sum is shifted right by 3 and packed to a byte with
// saturation to 0..255.
//
//   im       - first element of the centre row; elements are handled as
//              16-bit words, rows are 'width' elements apart
//   im_out   - receives one byte per processed element
//   dataSize - number of elements to produce; handled in chunks of 16, an
//              incomplete trailing chunk (or dataSize <= 0) is not processed
//   width    - row stride of 'im' in elements
//
// movdqa and the memory operands of paddusw need 16-byte aligned addresses, so
// 'im', 'im_out' and the row stride in bytes (2*width) must keep every access
// 16-byte aligned. The rows above and below the centre row must be readable.
// NOTE(review): the shift is 3 for every mask height, i.e. the result is
// sum/8, not a true mean over 9 rows - confirm the intended scaling.
inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1			// edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize	// eax = elements still to produce
	mov ecx, im_out		// ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi		// ebx = im - 4 rows (top row of the window)

	row_sum_loop:

		// Continue only while a complete 16-element chunk is left. The signed
		// compare also stops for dataSize <= 0 and for a partial tail, where
		// testing for exactly zero would never terminate.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx	// edx walks down the window rows of this chunk
		add ebx, 32		// next chunk: 16 words further along the top row

		// row 1 of 9 initialises the accumulators
		movdqa xmm3, [edx]		// xmm3 = words 0..7
		movdqa xmm2, [edx+16]	// xmm2 = words 8..15
		add edx, edi

		// rows 2..9
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16		// 16 elements done
		add ecx, 16		// advance output by 16 bytes

		jmp row_sum_loop

	end_sum_loop:
	emms				// no MMX register is used above; kept to preserve the original behavior
	}
}
 
// Vertical 11-row box filter (MSVC 32-bit inline assembly, SSE2).
//
// For every output element the 16-bit values of the 11 rows centred on the
// current row of 'im' (5 rows above .. 5 rows below) are added with unsigned
// saturation, the sum is shifted right by 3 and packed to a byte with
// saturation to 0..255.
//
//   im       - first element of the centre row; elements are handled as
//              16-bit words, rows are 'width' elements apart
//   im_out   - receives one byte per processed element
//   dataSize - number of elements to produce; handled in chunks of 16, an
//              incomplete trailing chunk (or dataSize <= 0) is not processed
//   width    - row stride of 'im' in elements
//
// movdqa and the memory operands of paddusw need 16-byte aligned addresses, so
// 'im', 'im_out' and the row stride in bytes (2*width) must keep every access
// 16-byte aligned. The rows above and below the centre row must be readable.
// NOTE(review): the shift is 3 for every mask height, i.e. the result is
// sum/8, not a true mean over 11 rows - confirm the intended scaling.
inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1			// edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize	// eax = elements still to produce
	mov ecx, im_out		// ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi		// ebx = im - 5 rows (top row of the window)

	row_sum_loop:

		// Continue only while a complete 16-element chunk is left. The signed
		// compare also stops for dataSize <= 0 and for a partial tail, where
		// testing for exactly zero would never terminate.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx	// edx walks down the window rows of this chunk
		add ebx, 32		// next chunk: 16 words further along the top row

		// row 1 of 11 initialises the accumulators
		movdqa xmm3, [edx]		// xmm3 = words 0..7
		movdqa xmm2, [edx+16]	// xmm2 = words 8..15
		add edx, edi

		// rows 2..11
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16		// 16 elements done
		add ecx, 16		// advance output by 16 bytes

		jmp row_sum_loop

	end_sum_loop:
	emms				// no MMX register is used above; kept to preserve the original behavior
	}
}
 
// Vertical 13-row box filter (MSVC 32-bit inline assembly, SSE2).
//
// For every output element the 16-bit values of the 13 rows centred on the
// current row of 'im' (6 rows above .. 6 rows below) are added with unsigned
// saturation, the sum is shifted right by 3 and packed to a byte with
// saturation to 0..255.
//
//   im       - first element of the centre row; elements are handled as
//              16-bit words, rows are 'width' elements apart
//   im_out   - receives one byte per processed element
//   dataSize - number of elements to produce; handled in chunks of 16, an
//              incomplete trailing chunk (or dataSize <= 0) is not processed
//   width    - row stride of 'im' in elements
//
// movdqa and the memory operands of paddusw need 16-byte aligned addresses, so
// 'im', 'im_out' and the row stride in bytes (2*width) must keep every access
// 16-byte aligned. The rows above and below the centre row must be readable.
// NOTE(review): the shift is 3 for every mask height, i.e. the result is
// sum/8, not a true mean over 13 rows - confirm the intended scaling.
inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {

	mov edi, width
	shl edi, 1			// edi = row stride in bytes (2 bytes per element)

	mov eax, dataSize	// eax = elements still to produce
	mov ecx, im_out		// ecx = output cursor

	mov ebx, im
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi
	sub ebx, edi		// ebx = im - 6 rows (top row of the window)

	row_sum_loop:

		// Continue only while a complete 16-element chunk is left. The signed
		// compare also stops for dataSize <= 0 and for a partial tail, where
		// testing for exactly zero would never terminate.
		cmp eax, 16
		jl end_sum_loop

		mov edx, ebx	// edx walks down the window rows of this chunk
		add ebx, 32		// next chunk: 16 words further along the top row

		// row 1 of 13 initialises the accumulators
		movdqa xmm3, [edx]		// xmm3 = words 0..7
		movdqa xmm2, [edx+16]	// xmm2 = words 8..15
		add edx, edi

		// rows 2..13
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums: >> 3
		psrlw xmm3, 3
		psrlw xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3,xmm2
		movdqa [ecx], xmm3

		sub eax, 16		// 16 elements done
		add ecx, 16		// advance output by 16 bytes

		jmp row_sum_loop

	end_sum_loop:
	emms				// no MMX register is used above; kept to preserve the original behavior
	}
}
 
// Applies a vertical all-ones mask of height 'sizeMask' to 'im' and stores the
// result in 'im_out' by forwarding to the kernel written for that height.
// Heights 7, 9, 11 and 13 use the SSE2 kernels of this file, heights 15 and 17
// are forwarded to avg_Col_15 / avg_Col_17 (not defined in this file), and
// every other value - including 5 - is handled by the 5-row SSE2 kernel.
void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	if (sizeMask == 7)
		avg_Col_7_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 9)
		avg_Col_9_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 11)
		avg_Col_11_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 13)
		avg_Col_13_sse2(im, im_out, dataSize, width);
	else if (sizeMask == 15)
		avg_Col_15(im, im_out, dataSize, width);
	else if (sizeMask == 17)
		avg_Col_17(im, im_out, dataSize, width);
	else
		avg_Col_5_sse2(im, im_out, dataSize, width);	// 5 and any unsupported height
}