www.pudn.com > Estereo.rar > processingMMX.cpp


/***************************************************************************  
* 
* Copyright 2004 by the Massachusetts Institute  of Technology.   All    
* rights reserved.  
*   
* Developed  by the Vision Interface Group   
* at the Computer Sciences and Artificial Intelligence Laboratory,  
* MIT, Cambridge, Massachusetts.  
*   
* Permission to use, copy, or modify this software and  its documentation  
* for  educational  and  research purposes only and without fee  is hereby  
* granted, provided  that this copyright notice and the original authors's  
* names appear  on all copies and supporting documentation.  If individual  
* files are  separated from  this  distribution directory  structure, this  
* copyright notice must be included.  For any other uses of this software,  
* in original or  modified form, including but not limited to distribution  
* in whole or in  part, specific  prior permission  must be  obtained from  
* MIT.  These programs shall not  be  used, rewritten, or  adapted as  the  
* basis  of  a  commercial  software  or  hardware product  without  first  
* obtaining appropriate licenses  from MIT.  MIT. makes no representations  
* about the suitability of this  software for any purpose.  It is provided  
* "as is" without express or implied warranty.  
*   
**************************************************************************/ 
 
#include "stereoMatching.h" 
#include "processingMMX.h" 
	 
//#include  
 
//  ImgSub2: D = saturation0(|S1 - S2| + |S1 - S3|) 
// TODO? divide the result by 2 (shift) 
int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,  
				 const unsigned char *Src3, unsigned char *Dest, int l) 
{ 
 
	if (l < 8) return 0;              // image size must be at least 8 bytes  
 
  __asm  
  {		 
        mov eax, Src1      
        mov ebx, Src2 
		mov edx, Src3 
        mov edi, Dest     
        mov	ecx, l    
        shr	ecx, 3	 
	 
align 16 
inner_loop: 
		movq	mm1,[eax]	// mm1=src1 
 
		movq	mm2,[ebx]	// mm2=src2 
 
		movq	mm4,mm1		// mm4=mm1 
 
		psubusb	mm4,mm2		// mm4 = src1 - src2 
 
		movq	mm3,[edx]	// mm3=src3 
		psubusb	mm2,mm1		// mm2 = src2 - src1 
         
		movq	mm5,mm1		// mm5=src1 
		por		mm2,mm4		// mm2=|src1-src2| 
 
        psubusb	mm5,mm3		// mm4=src1-src3 
 
        psubusb	mm3,mm1	 	// mm3=src3-src1 
 
		por		mm3,mm5		// mm3=|src1-src3| 
 
		paddusb mm2,mm3		// mm2 = |src1-src2|+|src1-src3| 
 
        movq    [edi], mm2	  
        add eax,8          
        add ebx,8      
        add edx,8      
        add edi,8		 
        dec ecx       
        jnz inner_loop     
        emms   		 
  } 
	 
  return 1; 
} 
 
//  ImgSub2: D = saturation0(|S1 - S2|) 
// TODO? divide the result by 2 (shift) 
int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,  
				 const unsigned char *Dest, int l) 
{ 
 
	if (l < 8) return 0;              // image size must be at least 8 bytes  
 
  __asm  
  {		 
        mov eax, Src1      
        mov ebx, Src2 
        mov edi, Dest     
        mov	ecx, l    
        shr	ecx, 3	 
	 
align 16 
inner_loop: 
		movq	mm1,[eax]	// mm1=src1 
		movq	mm2,[ebx]	// mm2=src2 
 
		movq	mm4,mm1		// mm4=mm1 
		psubusb	mm4,mm2		// mm4 = src1 - src2 
 
		psubusb	mm2,mm1		// mm2 = src2 - src1   
		por		mm2,mm4		// mm2=|src1-src2| 
 
        movq    [edi], mm2	  
        add eax,8          
        add ebx,8      
        add edi,8		 
        dec ecx       
        jnz inner_loop     
        emms   		 
  } 
	 
  return 1; 
} 
 
 
 
 
 
 
#define _ABS_DIFF_TRI(Z) __asm \ 
{ \ 
	__asm	movq	mm4,mm1		/* mm4=mm1 */ \ 
	__asm	add ebx, width  \ 
	__asm	add edi, imageSize \ 
	__asm	por		mm3,mm7		/* here mm2=new src2		mm3=new src3 */ \ 
\ 
	__asm	movq	mm7, mm0 \ 
	__asm	psubusb	mm4,mm2		/* mm4 = src1 - src2 */ \ 
\ 
	__asm	psubusb	mm2,mm1		/* mm2 = src2 - src1 */ \ 
	__asm	psllq	mm7,Z	\ 
\ 
	__asm	movq	mm5,mm1		/* mm5=src1 */ \ 
	__asm	por		mm4,mm2		/* mm2=|src1-src2| */ \ 
\ 
	__asm	movq	mm2,[ebx]	/* mm2= src2 + 'width' = new src2*/ \ 
	__asm	psubusb	mm5,mm3		/* mm5=src1-src3*/ \ 
\ 
	__asm	movq	mm6,mm3		/* mm6=src3*/ \ 
    __asm   psubusb	mm6,mm1	 	/* mm3=src3-src1*/ \ 
\ 
	__asm	por		mm6,mm5		/* mm6=|src1-src3|*/ \ 
	__asm	paddusb mm4,mm6		/* mm4 = |src1-src2|+|src1-src3|*/ \ 
\ 
	__asm	movq    [edi], mm4	 /* here mm1=src1*/	 \ 
	__asm	psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end*/\ 
} 
 
 
#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \ 
{ \ 
	__asm	movq	mm4,mm1		/* mm4=mm1 */ \ 
	__asm	add ebx, width  \ 
	__asm	add edi, imageSize \ 
	__asm	por		mm3,mm7		/* here mm2=new src2		mm3=new src3 */ \ 
\ 
	__asm	movq	mm7, mm0 \ 
	__asm	psubusb	mm4,mm2		/* mm4 = src1 - src2 */ \ 
\ 
	__asm	psubusb	mm2,mm1		/* mm2 = src2 - src1 */ \ 
	__asm   prefetcht0 [ebx + X] \ 
	__asm	psllq	mm7,Z	\ 
\ 
	__asm	movq	mm5,mm1		/* mm5=src1 */ \ 
	__asm	por		mm4,mm2		/* mm2=|src1-src2| */ \ 
\ 
\ 
	__asm	movq	mm2,[ebx]	/* mm2= src2 + 'width' = new src2*/ \ 
	__asm	psubusb	mm5,mm3		/* mm5=src1-src3*/ \ 
\ 
	__asm	movq	mm6,mm3		/* mm6=src3*/ \ 
    __asm   psubusb	mm6,mm1	 	/* mm3=src3-src1*/ \ 
\ 
	__asm	por		mm6,mm5		/* mm6=|src1-src3|*/ \ 
	__asm	paddusb mm4,mm6		/* mm4 = |src1-src2|+|src1-src3|*/ \ 
\ 
	__asm	movq    [edi], mm4	 /* here mm1=src1*/	 \ 
	__asm	psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end*/\ 
} 
 
//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) 
// process 8 disparities at a time 
// 
//			Src1: right	 
//			Src2: top	 
//			Src3: left 
// 
// TODO? divide the result by 2 (shift) 
int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2,  
				 const unsigned char *Src3,  
				 unsigned char* Dest1, int l, int imageSize, int width) 
{	 
	if (l < 8) return 0;              // image size must be at least 8 bytes  
	const int back_step1 = 7*width; 
	const int back_step2 = 7*imageSize; 
  __asm  
  {		 
        mov eax, Src1      
        mov ebx, Src2 
		mov edx, Src3 
        mov edi, Dest1    
 
        mov	ecx, l    
        shr	ecx, 3	 
	 
		movq	mm0,[edx]	// mm0=src3 
		movq	mm0,[edx]	// mm0=src3 
align 16 
inner_loop: 
		movq	mm1,[eax]	// mm1=src1 
		movq	mm3,mm0		// mm3=src3 
 
		movq	mm2,[ebx]	// mm2=src2 
        add eax,8          
  
		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 -- 
		movq	mm4,mm1		// mm4=mm1 
 
		add		ebx,width 
 
		psubusb	mm4,mm2		// mm4 = src1 - src2 
		//prefetcht0 [ebx + 32 + 2*320] 
 
		movq	mm0,[edx+8] 
		psubusb	mm2,mm1		// mm2 = src2 - src1 
         
		movq	mm5,mm1		// mm5=src1 
		por		mm4,mm2		// mm2=|src1-src2| 
 
		movq	mm2,[ebx]	// mm2= src2 + 'width' = new src2 
		psubusb	mm5,mm3		// mm5=src1-src3 
 
		movq	mm6,mm3		// mm6=src3 
        psubusb	mm6,mm1	 	// mm3=src3-src1 
 
		movq	mm7, mm0 
		psrlq	mm3, 8		// mm3 = src3 + '1' ... with [x00000000] at the end 
 
		por		mm6,mm5		// mm6=|src1-src3| 
		paddusb mm4,mm6		// mm4 = |src1-src2|+|src1-src3| 
 
        movq    [edi], mm4	  
		psllq	mm7, 56		// here mm1=src1	mm2=NEW src2	mm3=begin of NEWsrc3 	  mm7=end of NEWsrc3 
		// ------------------------------------------------------------- 
		 
 
		// - 2 ---------------- 
		_ABS_DIFF_TRI(48) 
	 
 		// - 3 ---------------- 
		_ABS_DIFF_TRI(40) 
 
 		// - 4 ---------------- 
		_ABS_DIFF_TRI(32) 
//		_ABS_DIFF_TRI_prefetch(32,24 + 3*320) 
 
		// - 5 ---------------- 
		_ABS_DIFF_TRI(24) 
		 
		// - 6 ---------------- 
		_ABS_DIFF_TRI(16) 
		 
		// - 7 ---------------- 
		_ABS_DIFF_TRI(8) 
	 
 
		// - 8 ---------------- 
		movq	mm4,mm1		// mm4=mm1 
		por		mm3,mm7		// here mm2=new src2		mm3=new src3 
 
		psubusb	mm4,mm2		// mm4 = src1 - src2 
		psubusb	mm2,mm1		// mm2 = src2 - src1 
         
		movq	mm5,mm1		// mm5=src1 
		por		mm4,mm2		// mm2=|src1-src2| 
 
		psubusb	mm5,mm3		// mm5=src1-src3 
        psubusb	mm3,mm1	 	// mm3=src3-src1 
 
		por		mm3,mm5		// mm6=|src1-src3| 
		paddusb mm4,mm3		// mm4 = |src1-src2|+|src1-src3| 
		 
		add edi, imageSize 
 
        movq    [edi], mm4	 // here mm1=src1	 
		// ------------------------------------------------------------- 
 		//  
		sub ebx, back_step1 
        add ebx,8 
        add edx,8      
		sub edi, back_step2 
        add edi,8		 
        dec ecx       
        jnz inner_loop     
        emms   		 
  } 
	 
  return 1; 
} 
 
 
// macro: in: mm1,mm2 
#define _ABS_DIFF_ __asm \ 
{ \ 
	__asm  movq	mm4,mm1 	/* mm4=mm1 */ \ 
	__asm  psubusb mm4,mm2 	/* mm4 = src1 - src2 */ \ 
	__asm  psubusb mm2,mm1 	/* mm2 = src2 - src1 */ \ 
	__asm  por 	mm4,mm2 	/* mm2=|src1-src2| */ \ 
	__asm  add ebx, width \ 
	__asm  add edi, imageSize \ 
    __asm  movq mm2,[ebx] \ 
	__asm  movq [edi], mm4 	 /* here mm1=src1	 */ \ 
} 
 
//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) 
// process 8 disparities at a time 
//			Src1: right	 
//			Src2: top	 
// TODO? divide the result by 2 (shift) 
int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2,  
						unsigned char* Dest1, int l, int imageSize, int width) 
{ 
 
	if (l < 8) return 0;              // image size must be at least 8 bytes  
	const int back_step1 = 7*width; 
	const int back_step2 = 7*imageSize; 
  __asm  
  {		 
        mov eax, Src1      
        mov ebx, Src2 
        mov edi, Dest1    
 
        mov	ecx, l    
        shr	ecx, 3	 
 
align 16 
inner_loop: 
 
		movq	mm1,[eax]	// mm1=src1 
		movq	mm2,[ebx]	// mm2=src2 
        add eax,8          
  
		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 -- 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
		_ABS_DIFF_ 
 
		// - 8 ---------------- 
		movq	mm4,mm1		// mm4=mm1 
 
		psubusb	mm4,mm2		// mm4 = src1 - src2 
		psubusb	mm2,mm1		// mm2 = src2 - src1 
         
		por		mm4,mm2		// mm2=|src1-src2| 
		add edi, imageSize 
 
        movq    [edi], mm4	 // here mm1=src1	 
		// ------------------------------------------------------------- 
 		//  
		sub ebx, back_step1 
        add ebx,8 
 		sub edi, back_step2 
        add edi,8		 
        dec ecx       
        jnz inner_loop     
        emms   		 
  } 
	 
  return 1; 
} 
 
// macro: in: mm1,mm2 
#define _ABS_DIFF_HORIZ(Z) __asm \ 
{ \ 
	__asm  movq	mm7, mm0 \ 
	__asm  add edi, imageSize \ 
	__asm  movq	mm5,mm1		/* mm5=src1 */ \ 
	__asm  psllq	mm7, Z \ 
	__asm  psubusb	mm5,mm3		/* mm5=src1-src3 */ \ 
	__asm  movq	mm6,mm3		/* mm6=src3 */ \ 
    __asm  psubusb	mm6,mm1	 	/* mm3=src3-src1 */ \ 
	__asm  por		mm6,mm5		/* mm6=|src1-src3| */ \ 
    __asm  movq    [edi], mm6	 /* here mm1=src1 */ \ 
	__asm  psrlq	mm3, 8		/* mm3 = src3 + '1' ... with [x00000000] at the end */ \ 
	__asm  por		mm3,mm7		/* here mm3=new src3 */ \ 
}	 
 
//  ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) 
// process 8 disparities at a time 
// 
//			Src1: right	 
//			Src2: top	 
//			Src3: left 
// 
// TODO? divide the result by 2 (shift) 
int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm,  
						 unsigned char* Dest, int l, int imageSize, int width) 
{ 
 
	if (l < 8) return 0;              // image size must be at least 8 bytes  
	const int back_step2 = 7*imageSize; 
  __asm  
  {		 
        mov eax, rightIm      
 		mov edx, leftIm 
        mov edi, Dest  
 
        mov	ecx, l    
        shr	ecx, 3	 
	 
		movq	mm0,[edx]	// mm0=src3 
		movq	mm0,[edx]	// mm0=src3 
align 16 
inner_loop: 
 
		movq	mm1,[eax]	// mm1=src1 
		movq	mm3,mm0		// mm3=src3 
 
		// -- 1 --------- in : mm1,mm2,mm3     out: mm4=SAD  mm2=new mm2 -- 
		movq	mm0,[edx+8] 
        add eax,8          
    
		movq	mm5,mm1		// mm5=src1 
		psubusb	mm5,mm3		// mm5=src1-src3 
 
		movq	mm6,mm3		// mm6=src3 
        psubusb	mm6,mm1	 	// mm3=src3-src1 
 
		movq	mm7, mm0 
		psrlq	mm3, 8		// mm3 = src3 + '1' ... with [x00000000] at the end 
 
		por		mm6,mm5		// mm6=|src1-src3| 
 
        movq    [edi], mm6	  
		psllq	mm7, 56		// here mm1=src1	mm3=begin of NEWsrc3 	  mm7=end of NEWsrc3 
		por		mm3,mm7		// here mm3=new src3 
 
		// - 2 ---------------- 
		_ABS_DIFF_HORIZ(48) 
		_ABS_DIFF_HORIZ(40) 
		_ABS_DIFF_HORIZ(32) 
		_ABS_DIFF_HORIZ(24) 
		_ABS_DIFF_HORIZ(16) 
		_ABS_DIFF_HORIZ(8) 
 
		// - 8 ---------------- 
		movq	mm5,mm1		// mm5=src1 
		add edi, imageSize 
 
		psubusb	mm5,mm3		// mm5=src1-src3 
        psubusb	mm3,mm1	 	// mm3=src3-src1 
 
		por		mm3,mm5		// mm6=|src1-src3|		 
        movq    [edi], mm3	  
		// ------------------------------------------------------------- 
 		//  
        add edx,8      
		sub edi, back_step2 
        add edi,8		 
        dec ecx       
        jnz inner_loop     
        emms   		 
  } 
	 
  return 1; 
} 
 
 
// ---------------------- 
// FULL IMAGE, BEST ONLY : Keith's code 
int findMinimumCorrelation( 
	const unsigned char *CurrentCorrelation, 
	unsigned char CurrentDisparity, 
	unsigned char *Disparity, 
	unsigned char *BestCorrelation, int bytecount)  
{	 
	if ((bytecount < 8) || ((bytecount % 8) != 0)) { 
		return 0; 
	} 
	 
	__asm { 
		// load ecx with the pixelblock count = bytecount / 8 
		mov			ecx,	bytecount 
		shr			ecx,	3 
		 
		// setup mm0 with 8 copies of the disparity constant 
		mov			al, 	CurrentDisparity	 
		mov			ah, 	al 
		mov			bx, 	ax 
		shl			eax, 	16 
		mov			ax, 	bx 
		movd		mm0, 	eax 
		movd		mm1, 	eax 
		punpckldq	mm0, 	mm1 
 
		// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion 
		mov			eax,	0x80808080 
		movd		mm1,	eax 
		movd		mm2,	eax 
		punpckldq	mm1,	mm2 
						 
		 
		// setup the image pointers		 
		mov		eax, 	BestCorrelation 
		mov		esi, 	CurrentCorrelation 
		mov		edi, 	Disparity 
		 
	pixel_loop: 
			movq		mm2,	[esi]	// current correlation 
			movq		mm3,	[eax]	// best correlation 
 
			// check for updates			 
			movq		mm5,	mm2		// copy the current correlation 
			pxor		mm5,	mm1		// convert from unsigned range to signed range 
 
			movq		mm6,	mm3		// copy the best correlation 
			pxor		mm6,	mm1		// convert from unsigned range to signed range 
 
			pcmpgtb		mm5,	mm6		// mm5 := (current signed> best) mask 
										//		1 indicates current >  best, so keep best 
										//		0 indicates	current <= best, so use new value 
 
			// BYPASS 
			// this phase adds 8 additional instructions, but could skip 2 writes and 1 read						 
			// abort remainder if not updating best correlation 
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			 
			pxor		mm6, 	mm5		// mm6 = mm5 xor 0xFFFFFFFF = not mm5 
								//		0 indicates current >  best, so keep best 
								//		1 indicates current <= best, so use new value 
						 
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation) 
								// 		11111111 11111111 => 11111111	some replaced 
								//		11111111 00000000 => 11111111	some replaced 
								//		00000000 11111111 => 11111111	some replaced 
								//		00000000 00000000 => 00000000	no replacements 
			 
			// we don't need to backup ebx because its not used in this routine 
			// movd		mm7,	ebx		// make a backup of eax 
			movd		ebx,	mm6		// get the saturated mask			 
			test		ebx,	ebx		// test ebx => yields 0 iff no substitutions will occur 
			// movd		ebx,	mm7		// restore ebx			 
			jz		bypass			// store mm4 (second correlation) to [ebx]		 
 
 
			// Update best Correlation 
			movq		mm6,	mm5		// mm6 := mask 
			movq		mm7,	mm5		// mm7 := mask 
 
			pand		mm6,	mm3		// best correlation values to keep 
			pandn		mm7,	mm2		// current correlation value to move to best correlation 
 
			por			mm6,	mm7		// merge values 
			movq		[eax],	mm6		// store values 
 
			// update disparity 
			movq		mm2,	[edi]	// get disparity map 
			movq		mm6,	mm5		// mm6 := mask 
 
			pand		mm5,	mm2		// select disparity map values to keep 
			pandn		mm6,	mm0		// select current disparity values to move to disparity map 
			 
			por			mm5,	mm6		// merge values 
			movq		[edi],	mm5		// store values 
			 
		bypass: 
			add		eax,	8 
			add		esi,	8 
			add		edi,	8 
 
			dec		ecx			 
		jnz pixel_loop 
 
	 
		emms; 
	} 
	 
	return 1; 
} 
 
/*int initMinimumCorrelation( 
	const unsigned char *CurrentCorrelation, 
	unsigned char disparityInit, 
	unsigned char *Disparity, 
	unsigned char *BestCorrelation, 
	unsigned char *SecondCorrelation, 
	int bytecount)  
{ 
	for (int i=0; i signed conversion 
		mov			eax,	0x80808080 
		movd		mm1,	eax 
		movd		mm2,	eax 
		punpckldq	mm1,	mm2 
						 
		 
		// setup the image pointers		 
		mov		eax, 	BestCorrelation 
		mov		ebx, 	SecondCorrelation 
		mov		esi, 	CurrentCorrelation 
		mov		edi, 	Disparity 
		 
	pixel_loop: 
			movq		mm2,	[esi]	// current correlation 
			movq		mm4,	[ebx]	// second correlation 
 
			// convert the current correlation from unsigned range to signed range 
			movq		mm5,	mm2		// copy the current correlation 
			pxor		mm5,	mm1		// convert from unsigned range to signed range 
			movq		mm7,	mm5		// copy converted to mm7 
 
 
			// check for second correlation updates 
			movq		mm6,	mm4		// copy second best correlation 
			pxor		mm6,	mm1		// convert from unsigned range to signed range 
 
			pcmpgtb		mm7,	mm6		// mm7 := (current signed> second best) mask 
 
			// BYPASS 1 
			// skip remainder if second correlation is not to be updated 
			// this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes 
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			 
			pxor		mm6, 	mm7		// mm6 = mm7 xor 0xFFFFFFFF = not mm7 
								//		0 indicates current >  second, so keep old value 
								//		1 indicates current <= second, so use new value 
								 
						 
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation) 
								// 		11111111 11111111 => 11111111	some replaced 
								//		11111111 00000000 => 11111111	some replaced 
								//		00000000 11111111 => 11111111	some replaced 
								//		00000000 00000000 => 00000000	no replacements 
			 
			// don't need to backup edx because its not used in this routine 
			// movd		mm3,	edx		// make a backup of edx 
			movd		edx,	mm6		// get the saturated mask			 
			test		edx,	edx		// test edx => yields 0 iff no replacements will occur 
			// movd		edx,	mm3		// restore edx			 
			jz			bypass1 
 
 
			// direct update second correlation (get values from current) 
										// mm7 already has mask 
//			movq		mm6,	mm7		// mm6 := mask 
//			pand		mm6,	mm4		// second correlation values to keep 
//			pandn		mm7,	mm2		// current correlation values to move to second correlation 
//			por			mm6,	mm7		// merge value => direct updated second correlation 
//			movq		mm4,	mm6		// store values (*** this instruction could be eliminated!)  
 
			pand		mm4,	mm7		// second correlation values to keep 
			pandn		mm7,	mm2		// current correlation values to move to second correlation 
			por			mm4,	mm7		// merge value => direct updated second correlation 
 
 
			// check for best correlation updates			 
			movq		mm3,	[eax]	// best correlation 
			//		mm5 has converted current correlation 
			movq		mm6,	mm3		// copy the best correlation 
			pxor		mm6,	mm1		// convert from unsigned range to signed range 
 
			pcmpgtb		mm5,	mm6		// mm5 := (current signed> best) mask 
										//		1 indicates current >  best, so keep best 
										//		0 indicates	current <= best, so use new value 
			// BYPASS 2		 
			// this phase adds 8 additional instructions, but could skip 2 writes and 1 read						 
			// abort remainder if not updating best correlation 
			pcmpeqb		mm6,	mm6		// mm6 = 0xFFFFFFFF			 
			pxor		mm6, 	mm5		// mm6 = mm5 xor 0xFFFFFFFF = not mm5 
								//		0 indicates current >  best, so keep best 
								//		1 indicates current <= best, so use new value 
						 
			packsswb	mm6,	mm6		// pack it into the lower dword of mm6 (unsigned saturation) 
								// 		11111111 11111111 => 11111111	some replaced 
								//		11111111 00000000 => 11111111	some replaced 
								//		00000000 11111111 => 11111111	some replaced 
								//		00000000 00000000 => 00000000	no replacements 
			 
			// don't need to backup edx because its not used in this routine 
			// movd		mm7,	edx		// make a backup of edx 
			movd		edx,	mm6		// get the saturated mask			 
			test		edx,	edx		// test edx => yields 0 iff no substitutions will occur 
			// movd		edx,	mm7		// restore edx			 
			jz		bypass2			// store mm4 (second correlation) to [ebx]		 
 
 
			// indirect update second correlation (pushed down from best) 
			movq		mm6,	mm5		// mm6 := mask 
			movq		mm7,	mm5		// mm7 := mask 
 
			pand		mm6,	mm4		// second correlation values to keep 
			pandn		mm7,	mm3		// best correlations to move to second correlation 
 
			por			mm6,	mm7		// merge values 
			movq		[ebx],	mm6		// store values 
 
			// direct Update best Correlation 
			movq		mm6,	mm5		// mm6 := mask 
			movq		mm7,	mm5		// mm7 := mask 
 
			pand		mm6,	mm3		// best correlation values to keep 
			pandn		mm7,	mm2		// current correlation value to move to best correlation 
 
			por			mm6,	mm7		// merge values 
			movq		[eax],	mm6		// store values 
 
			// update disparity 
			movq		mm2,	[edi]	// get disparity map 
			movq		mm6,	mm5		// mm6 := mask 
 
			pand		mm5,	mm2		// select disparity map values to keep 
			pandn		mm6,	mm0		// select current disparity values to move to disparity map 
			 
			por			mm5,	mm6		// merge values 
			movq		[edi],	mm5		// store values 
 
 
		bypass1: 
		next_pixel:			 
			add		eax,	8 
			add		ebx,	8 
			add		esi,	8 
			add		edi,	8 
 
			dec		ecx			 
		jnz pixel_loop 
 
		jmp done 
 
		bypass2: 
			movq	[ebx], mm4; 
			jmp next_pixel 
 
	done: 
		emms; 
	} 
	 
	return 1; 
} 
 
 
 
 
 void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)  
{ 
	switch (maskSize) 
	{ 
	case 5: 
		sum_Row_5(im, im_out, rowSize); 
	break; 
 
	case 9: 
	case 8: 
	case 7: 
	case 6: 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	case 13: 
	case 12: 
	case 11: 
	case 10: 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	case 17: 
	case 16: 
	case 15: 
	case 14 : 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	default: 
	break; 
	} 
} 
 
 void sum_Row(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)  
{ 
	switch (maskSize) 
	{ 
	case 5: 
		sum_Row_5(im, im_out, rowSize); 
	break; 
 
	case 9: 
	case 8: 
	case 7: 
	case 6: 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	case 13: 
	case 12: 
	case 11: 
	case 10: 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	case 17: 
	case 16: 
	case 15: 
	case 14 : 
		sum_Row_5(im, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
		sum_Row_5(im_out, im_out, rowSize); 
	break; 
 
	default: 
	break; 
	} 
} 
 
#define aim_Sum_Words_In_MM1 __asm \ 
{						\ 
	__asm movq mm4, mm1 \ 
	__asm movq mm2, mm1 \ 
\ 
	__asm movq mm3, mm1 \ 
	__asm psllq mm1, 16 \ 
\ 
	__asm psrlq mm2, 16 \ 
	__asm paddw mm4, mm2 \ 
\ 
	__asm paddw mm3, mm1 \ 
	__asm psrlq mm2, 16 \ 
\ 
	__asm psllq mm1, 16 \ 
	__asm paddw mm4, mm2 \ 
\ 
	__asm psrlq mm2, 16 \ 
	__asm paddw mm3, mm1 \ 
\ 
	__asm psllq mm1, 16 \ 
	__asm paddw mm4, mm2 \ 
\ 
	__asm paddw mm3, mm1 \ 
} 
 
 
 
 
 
// apply the mask [1 1 1 1 1] to the 1-D array im (bytes) 
// output : im_out (words) 
void sum_Row_5(uchar* im, unsigned short* im_out, int rowSize)  
{ 
	__asm { 
 
	mov eax, rowSize 
	mov ebx, im 
	mov ecx, im_out 
 
	pxor mm6, mm6	// mm6 = x00000000 
 
	//Process the first quad word, but save only the second result"  
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
 
	//Process low word  
	movq mm1, [ebx] // Copy...  
	punpcklbw mm1, mm6 // Expand low word bytes into words		// mm1 =[D C B A] 
 
	aim_Sum_Words_In_MM1 
 
	//Store the result Only in the accumulator  
	movq mm7, mm4 // Update accumulator    mm4=[D  C+D   B+C+D   A+B+C+D] 
 
	//Process high word  
	movq mm1, [ebx] // Copy...  
	punpckhbw mm1, mm6 // Expand high word bytes into words   // mm1 =[H G F E] 
	add ebx, 8 // Update input pointer  
 
    aim_Sum_Words_In_MM1 
 
	//Add to the previous data	...  
	// mm3=[E+F+G+H  E+F+G  E+F   E] 
	// mm4=[H	G+H    F+G+H   E+F+G+H] 
	paddw mm7, mm3 // The current word of the accum   // mm7=[D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E] 
 
	// translate everything to 2 words on the left 
	movq	mm1, mm7	// mm1 = [D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E] 
	psrlq   mm1, 32		// mm1 = [0  0  D+E+F+G+H  C+D+E+F+G] 
 
	movq	mm0, mm1	// mm0 = [D+E+F+G+H  C+D+E+F+G] 
 
	psllq	mm7, 32		// mm7 = [B+C+D+E+F  A+B+C+D+E  0   0] 
 
	movq [ecx], mm7 // Store the final result  
	add ecx, 8 // Update output pointer  
 
	movq mm7, mm4 // Update accumulator    mm4=[H	G+H    F+G+H   E+F+G+H] 
	sub eax, 8 // Update the number of points left  
 
	// Start the loop  
	row_sum_loop: 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		movq mm1, [ebx] // Load data  
 
		//Process low word  
		punpcklbw mm1, mm6 // Expand low word bytes into words  
 
		aim_Sum_Words_In_MM1 
 
		//Add to the previous data  
		//prefetcht1 [ecx+16] 
		paddw mm7, mm3	// The current word of the accum  
 
		// translate everything to 2 words on the left 
		// mm0 = [0 0 D C]  mm7 = [H G F E] ----> mm7=[0 0 H G]  [ecx]=[F E D C] 
		punpckldq	mm0, mm7		// mm0 = [F E D C] 
 
		movq		[ecx], mm0 
		sub eax, 8 // Update the number of points left 	 
 
		movq mm0, mm4	// Update accumulator  
		psrlq		mm7, 32			// mm7 = [0 0 H G] 
 
		//Process high word  
		movq mm1, [ebx]		// Copy...  
		punpckhbw mm1, mm6	// Expand high word bytes into words  
		 
		aim_Sum_Words_In_MM1 
 
		//Add to the previous data  
		paddw mm0, mm3	// The current word of the accum  
 
		// translate everything to 2 words on the left 
		// mm7 = [0 0 D C]  mm0 = [H G F E] ----> mm0=[0 0 H G]  [ecx+8]=[F E D C] 
		punpckldq	mm7, mm0		// mm7 = [F E D C] 
		add ebx, 8 // Update input pointer  
 
		movq		[ecx+8], mm7 
		psrlq		mm0, 32			// mm0 = [0 0 H G]  
 
		movq mm7, mm4	// Update accumulator  
		add ecx, 16 // Update output pointer  
		 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
 
 
// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words) 
// output : im_out (words) 
void sum_Row_5(ushort* im, ushort* im_out, int rowSize)  
{ 
	__asm { 
 
	mov eax, rowSize 
	mov ebx, im 
	mov ecx, im_out 
 
	//Process the first quad word, but save only the second result"  
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	movq mm1, [ebx] // Load data (4 words) 
	add ebx, 8 // Update input pointer  
 
	//Process low word  
	aim_Sum_Words_In_MM1 
 
	//Store the result Only in the accumulator  
	movq mm7, mm4 // Update accumulator  
 
	//Process high word  
	movq mm1, [ebx] // Copy...  
 
	aim_Sum_Words_In_MM1 
	add ebx, 8 
 
	//Add to the previous data  
	paddw mm7, mm3 // The current word of the accum  
 
	// translate everything to 2 words on the left 
	movq	mm1, mm7	// mm1 = [D+E+F+G+H  C+D+E+F+G  B+C+D+E+F  A+B+C+D+E] 
	psrlq   mm1, 32		// mm1 = [0  0  D+E+F+G+H  C+D+E+F+G] 
	movq	mm0, mm1	// mm0 = [0  0  D+E+F+G+H  C+D+E+F+G] 
	psllq	mm7, 32		// mm7 = [B+C+D+E+F  A+B+C+D+E  0   0] 
 
	movq [ecx], mm7 // Store the final result  
	movq mm7, mm4 // Update accumulator  
 
	add ecx, 8 // Update output pointer  
	sub eax, 8 // Update the number of points left  
 
	// Start the loop  
	row_sum_loop: 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		movq mm1, [ebx] // Load data  
 
		aim_Sum_Words_In_MM1 
 
		//Add to the previous data  
		//prefetcht0	[ecx + 32] 
		//prefetcht0	[ebx + 48] 
		paddw mm7, mm3 // The current word of the accum  
		psrlw mm7, 2 // divide result by ... 
 
		// translate everything to 2 words on the left 
		// mm0 = [0 0 D C]  mm7 = [H G F E] ----> mm7 =[0 0 H G]  [ecx]=[F E D C] 
		punpckldq	mm0, mm7  // mm0 = [F E D C] 
		 
		movq		[ecx], mm0 
		sub eax, 8 // Update the number of points left  
 
		movq	mm0, mm4			// Update accumulator  
		psrlq	mm7, 32		// mm7 =[0 0 H G] 
		 
		//Process high word  
		movq mm1, [ebx+8] // Copy...  
		 
		aim_Sum_Words_In_MM1 
 
		//Add to the previous data  
		paddw mm0, mm3 // The current word of the accum  
		psrlw mm0, 2 // divide result by ... 
		 
		// translate everything to 2 words on the left 
		// mm7 = [0 0 D C]  mm0 = [H G F E] ----> mm0=[0 0 H G]  [ecx+8]=[F E D C] 
		punpckldq	mm7, mm0	// mm7 = [F E D C] 
		add ebx, 16 // Update input pointer  
 
		movq		[ecx+8], mm7 
		psrlq		mm0, 32		// mm0 = [0 0 H G] 
 
		movq mm7, mm4	// Update accumulator  
		add ecx, 16 // Update output pointer */ 
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
 
// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im' 
// result in 'im_out' 
void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask) 
{ 
	switch (sizeMask) 
	{ 
	case 5: avg_Col_5(im,im_out,dataSize,width); 
		break; 
	case 7: avg_Col_7(im,im_out,dataSize,width); 
		break; 
	case 9: avg_Col_9(im,im_out,dataSize,width); 
		break;	 
	case 11: avg_Col_11(im,im_out,dataSize,width); 
		break; 
	case 13: avg_Col_13(im,im_out,dataSize,width); 
		break; 
	case 15: avg_Col_15(im,im_out,dataSize,width); 
		break; 
	case 17: avg_Col_17(im,im_out,dataSize,width); 
		break; 
 
	default: avg_Col_5(im,im_out,dataSize,width); 
		break; 
 
	} 
} 
 
 
 
#define macro_add __asm \ 
{						\ 
	__asm 	paddusw mm3, [edx]	\ 
	__asm	paddusw mm2, [edx+8]  \ 
	__asm	add edx, edi		\ 
} 
 
 
void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-4*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 3 
		psrlw mm2, 3 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-6*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		add ebx, 16 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 3 
		psrlw mm2, 3 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-8*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 3 
		psrlw mm2, 3 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi  
	sub ebx, edi // ebx = ebx-10*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 4 
		psrlw mm2, 4 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi  
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-12*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 4 
		psrlw mm2, 4 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi  
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-14*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 4 
		psrlw mm2, 4 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi 
	sub ebx, edi  
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-16*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// divide results by ... 
		psrlw mm3, 4 
		psrlw mm2, 4 
 
		// convert [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
 
void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-4*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// save [mm2 mm3] as 8 bytes 
		packuswb mm3,mm2 
		movq [ecx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 8 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width) 
{ 
	__asm { 
 
	mov edi, width 
	shl edi, 1  // edi = 2*width 
 
	mov eax, dataSize 
	mov ecx, im_out 
 
	mov ebx, im 
	sub ebx, edi 
	sub ebx, edi // ebx = ebx-4*width 
	 
	test eax, eax // Is there anything to do?"  
	jz end_sum_loop // Jump out if necessary  
 
	row_sum_loop: 
 
		test eax, eax // Is there anything to do?  
		jz end_sum_loop // Jump out if necessary  
 
		mov edx, ebx 
		add ebx, 16 
 
		// 1 
		movq mm3, [edx] // mm3 = 4 words of im 
		movq mm2, [edx+8] // mm2 = next 4 words of im 
		add edx, edi 
 
		macro_add 
		macro_add 
		macro_add 
		macro_add 
		 
		// save [mm2 mm3] as words 
		movq [ecx], mm3 
		movq [ecx+8], mm2 
 
		sub eax, 8 // Update the number of points left  
		add ecx, 16 // Update output pointer  
 
		jmp row_sum_loop // Loop  
 
		//Cleanup  
	end_sum_loop: 
	emms  
	} 
} 
 
// compare bestScores and secondScores. if secondthresh 
							//		 0 otherwise 
			 
		pand	mm3, mm2 
		pandn	mm2, mm7 
 
		por		mm3, mm2 
		movq	[edx], mm3 
 
		sub eax, 8 // Update the number of points left  
		add ebx, 8 // Update output pointer  
		add ecx, 8  
		add edx, 8 
 
		jmp comp_loop // Loop  
 
		//Cleanup  
	end_loop: 
	emms  
	} 
} 
 
// windowWidth must be multiple of 8 
void cropImage(const uchar* imSrc, int width, int height,  
			   uchar* imDest, int x0, int y0, int windowWidth, int windowHeight) 
{ 
	int w8 = windowWidth/8; 
 
	int step = width-windowWidth; 
	const uchar* srcNewOrigin = imSrc+x0+y0*width; 
 
	__asm { 
 
		mov		ecx,	windowHeight 
 
		mov		edx,	w8 
		mov		eax, 	srcNewOrigin 
		mov		ebx, 	imDest 
 
	pixel_loop: 
 
		movq	mm1, [eax] 
		movq	[ebx], mm1 
		add		eax, 8 
		add		ebx, 8 
	 
		dec		edx 
		jnz	pixel_loop 
 
		mov		edx, w8 
		add		eax, step 
 
		dec		ecx		 
		jnz pixel_loop 
 
		jmp done 
 
	done: 
		emms; 
	} 
} 
 
// return the average pixel value 
float pixelMean(const uchar* im, int imageSize) 
{ 
	int sum; 
 
	__asm { 
 
		mov		ecx, imageSize 
		shr		ecx, 3 
 
		mov		eax, im 
		pxor	mm7,mm7		// mm7 used as accumulator 
		pxor	mm0,mm0		// mm0 = 0 
 
	pixel_loop: 
			 
			movq	mm1, [eax] 
			movq	mm2,mm1 
 
			punpcklbw	mm2, mm0 
			punpckhbw	mm1, mm0 
 
			paddw	mm2,mm1 
			 
			movq	mm1,mm2 
			punpcklwd	mm2, mm0 
			punpckhwd	mm1, mm0 
 
			paddd	mm2,mm1 
			paddd	mm7,mm2 
 
			add eax, 8 
			dec	ecx 
			jnz	pixel_loop 
 
			jmp done 
 
	done: 
			movd	ebx, mm7 
			psrlq	mm7, 32 
			movd	edx, mm7 
			add	ebx, edx 
			mov sum, ebx 
 
			emms 
	} 
 
	return sum / (float)imageSize; 
} 
 
 
 
 
// ------------------------------------------------------------- 
// apply mask: 
// if mask[]=undefined_val	im[]->im[] 
// otherwise,				im[]->mask[] 
// ....... this one may not be exact :-( 
void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) 
{ 
	__asm { 
		// setup mm0 with 8 copies of 'undefined_val' 
		mov			al, 	undefined_val	 
		mov			ah, 	al 
		mov			bx, 	ax 
		shl			eax, 	16 
		mov			ax, 	bx 
		movd		mm0, 	eax 
		movd		mm1, 	eax 
		punpckldq	mm0, 	mm1 
		 
		mov		ecx, imageSize 
		shr		ecx, 3 
 
		mov		eax, im 
		mov		ebx, mask 
 
		pixel_loop: 
			movq	mm1, [eax] 
			movq	mm2, [ebx] 
 
			movq	mm3, mm2 
			pcmpeqb	mm3, mm0	// mm3[]	-> xFF if mm2[]==undefined_val 
								//			-> x00 otherwise 
			pand	mm3, mm1	// mm3[] = mm1[] if mm2[]==undefined_val 
								//		 = x00 	 otherwise 
			por		mm3, mm2 
			movq	[eax], mm3 
 
			add	eax, 8 
			add ebx, 8 
			dec ecx 
			jnz pixel_loop 
 
			jmp done 
 
		done: 
			emms 
	} 
} 
 
void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) 
{ 
	for (int i=0; i