www.pudn.com > Estereo.rar > processingMMX.inl
/***************************************************************************
*
* Copyright 2004 by the Massachusetts Institute of Technology. All
* rights reserved.
*
* Developed by David Demirdjian
* at the Computer Sciences and Artificial Intelligence Laboratory,
* MIT, Cambridge, Massachusetts.
*
* Permission to use, copy, or modify this software and its documentation
* for educational and research purposes only and without fee is hereby
* granted, provided that this copyright notice and the original authors's
* names appear on all copies and supporting documentation. If individual
* files are separated from this distribution directory structure, this
* copyright notice must be included. For any other uses of this software,
* in original or modified form, including but not limited to distribution
* in whole or in part, specific prior permission must be obtained from
* MIT. These programs shall not be used, rewritten, or adapted as the
* basis of a commercial software or hardware product without first
* obtaining appropriate licenses from MIT. MIT. makes no representations
* about the suitability of this software for any purpose. It is provided
* "as is" without express or implied warranty.
*
**************************************************************************/
#include "stereoMatching.h"
#include "processingMMX.h"
// ImgSub2: D = saturation0(|S1 - S2| + |S1 - S3|)
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
const unsigned char *Src3, unsigned char *Dest, int l)
{
if (l < 8) return 0; // image size must be at least 8 bytes
__asm
{
mov eax, Src1
mov ebx, Src2
mov edx, Src3
mov edi, Dest
mov ecx, l
shr ecx, 3
align 16
inner_loop:
movq mm1,[eax] // mm1=src1
movq mm2,[ebx] // mm2=src2
movq mm4,mm1 // mm4=mm1
psubusb mm4,mm2 // mm4 = src1 - src2
movq mm3,[edx] // mm3=src3
psubusb mm2,mm1 // mm2 = src2 - src1
movq mm5,mm1 // mm5=src1
por mm2,mm4 // mm2=|src1-src2|
psubusb mm5,mm3 // mm4=src1-src3
psubusb mm3,mm1 // mm3=src3-src1
por mm3,mm5 // mm3=|src1-src3|
paddusb mm2,mm3 // mm2 = |src1-src2|+|src1-src3|
movq [edi], mm2
add eax,8
add ebx,8
add edx,8
add edi,8
dec ecx
jnz inner_loop
emms
}
return 1;
}
// ImgSub2: D = saturation0(|S1 - S2|)
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
const unsigned char *Dest, int l)
{
if (l < 8) return 0; // image size must be at least 8 bytes
__asm
{
mov eax, Src1
mov ebx, Src2
mov edi, Dest
mov ecx, l
shr ecx, 3
align 16
inner_loop:
movq mm1,[eax] // mm1=src1
movq mm2,[ebx] // mm2=src2
movq mm4,mm1 // mm4=mm1
psubusb mm4,mm2 // mm4 = src1 - src2
psubusb mm2,mm1 // mm2 = src2 - src1
por mm2,mm4 // mm2=|src1-src2|
movq [edi], mm2
add eax,8
add ebx,8
add edi,8
dec ecx
jnz inner_loop
emms
}
return 1;
}
#define _ABS_DIFF_TRI(Z) __asm \
{ \
__asm movq mm4,mm1 /* mm4=mm1 */ \
__asm add ebx, width \
__asm add edi, imageSize \
__asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
\
__asm movq mm7, mm0 \
__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
\
__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
__asm psllq mm7,Z \
\
__asm movq mm5,mm1 /* mm5=src1 */ \
__asm por mm4,mm2 /* mm2=|src1-src2| */ \
\
__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
__asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
\
__asm movq mm6,mm3 /* mm6=src3*/ \
__asm psubusb mm6,mm1 /* mm3=src3-src1*/ \
\
__asm por mm6,mm5 /* mm6=|src1-src3|*/ \
__asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \
\
__asm movq [edi], mm4 /* here mm1=src1*/ \
__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
}
#define _ABS_DIFF_TRI_prefetch(Z, X) __asm \
{ \
__asm movq mm4,mm1 /* mm4=mm1 */ \
__asm add ebx, width \
__asm add edi, imageSize \
__asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \
\
__asm movq mm7, mm0 \
__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
\
__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
__asm prefetcht0 [ebx + X] \
__asm psllq mm7,Z \
\
__asm movq mm5,mm1 /* mm5=src1 */ \
__asm por mm4,mm2 /* mm2=|src1-src2| */ \
\
\
__asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \
__asm psubusb mm5,mm3 /* mm5=src1-src3*/ \
\
__asm movq mm6,mm3 /* mm6=src3*/ \
__asm psubusb mm6,mm1 /* mm3=src3-src1*/ \
\
__asm por mm6,mm5 /* mm6=|src1-src3|*/ \
__asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \
\
__asm movq [edi], mm4 /* here mm1=src1*/ \
__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\
}
// ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
//
// Src1: right
// Src2: top
// Src3: left
//
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2,
const unsigned char *Src3,
unsigned char* Dest1, int l, int imageSize, int width)
{
if (l < 8) return 0; // image size must be at least 8 bytes
const int back_step1 = 7*width;
const int back_step2 = 7*imageSize;
__asm
{
mov eax, Src1
mov ebx, Src2
mov edx, Src3
mov edi, Dest1
mov ecx, l
shr ecx, 3
movq mm0,[edx] // mm0=src3
movq mm0,[edx] // mm0=src3
align 16
inner_loop:
movq mm1,[eax] // mm1=src1
movq mm3,mm0 // mm3=src3
movq mm2,[ebx] // mm2=src2
add eax,8
// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
movq mm4,mm1 // mm4=mm1
add ebx,width
psubusb mm4,mm2 // mm4 = src1 - src2
//prefetcht0 [ebx + 32 + 2*320]
movq mm0,[edx+8]
psubusb mm2,mm1 // mm2 = src2 - src1
movq mm5,mm1 // mm5=src1
por mm4,mm2 // mm2=|src1-src2|
movq mm2,[ebx] // mm2= src2 + 'width' = new src2
psubusb mm5,mm3 // mm5=src1-src3
movq mm6,mm3 // mm6=src3
psubusb mm6,mm1 // mm3=src3-src1
movq mm7, mm0
psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
por mm6,mm5 // mm6=|src1-src3|
paddusb mm4,mm6 // mm4 = |src1-src2|+|src1-src3|
movq [edi], mm4
psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
// -------------------------------------------------------------
// - 2 ----------------
_ABS_DIFF_TRI(48)
// - 3 ----------------
_ABS_DIFF_TRI(40)
// - 4 ----------------
_ABS_DIFF_TRI(32)
// _ABS_DIFF_TRI_prefetch(32,24 + 3*320)
// - 5 ----------------
_ABS_DIFF_TRI(24)
// - 6 ----------------
_ABS_DIFF_TRI(16)
// - 7 ----------------
_ABS_DIFF_TRI(8)
// - 8 ----------------
movq mm4,mm1 // mm4=mm1
por mm3,mm7 // here mm2=new src2 mm3=new src3
psubusb mm4,mm2 // mm4 = src1 - src2
psubusb mm2,mm1 // mm2 = src2 - src1
movq mm5,mm1 // mm5=src1
por mm4,mm2 // mm2=|src1-src2|
psubusb mm5,mm3 // mm5=src1-src3
psubusb mm3,mm1 // mm3=src3-src1
por mm3,mm5 // mm6=|src1-src3|
paddusb mm4,mm3 // mm4 = |src1-src2|+|src1-src3|
add edi, imageSize
movq [edi], mm4 // here mm1=src1
// -------------------------------------------------------------
//
sub ebx, back_step1
add ebx,8
add edx,8
sub edi, back_step2
add edi,8
dec ecx
jnz inner_loop
emms
}
return 1;
}
// macro: in: mm1,mm2
#define _ABS_DIFF_ __asm \
{ \
__asm movq mm4,mm1 /* mm4=mm1 */ \
__asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \
__asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \
__asm por mm4,mm2 /* mm2=|src1-src2| */ \
__asm add ebx, width \
__asm add edi, imageSize \
__asm movq mm2,[ebx] \
__asm movq [edi], mm4 /* here mm1=src1 */ \
}
// ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
// Src1: right
// Src2: top
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2,
unsigned char* Dest1, int l, int imageSize, int width)
{
if (l < 8) return 0; // image size must be at least 8 bytes
const int back_step1 = 7*width;
const int back_step2 = 7*imageSize;
__asm
{
mov eax, Src1
mov ebx, Src2
mov edi, Dest1
mov ecx, l
shr ecx, 3
align 16
inner_loop:
movq mm1,[eax] // mm1=src1
movq mm2,[ebx] // mm2=src2
add eax,8
// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
_ABS_DIFF_
_ABS_DIFF_
_ABS_DIFF_
_ABS_DIFF_
_ABS_DIFF_
_ABS_DIFF_
_ABS_DIFF_
// - 8 ----------------
movq mm4,mm1 // mm4=mm1
psubusb mm4,mm2 // mm4 = src1 - src2
psubusb mm2,mm1 // mm2 = src2 - src1
por mm4,mm2 // mm2=|src1-src2|
add edi, imageSize
movq [edi], mm4 // here mm1=src1
// -------------------------------------------------------------
//
sub ebx, back_step1
add ebx,8
sub edi, back_step2
add edi,8
dec ecx
jnz inner_loop
emms
}
return 1;
}
// macro: in: mm1,mm2
#define _ABS_DIFF_HORIZ(Z) __asm \
{ \
__asm movq mm7, mm0 \
__asm add edi, imageSize \
__asm movq mm5,mm1 /* mm5=src1 */ \
__asm psllq mm7, Z \
__asm psubusb mm5,mm3 /* mm5=src1-src3 */ \
__asm movq mm6,mm3 /* mm6=src3 */ \
__asm psubusb mm6,mm1 /* mm3=src3-src1 */ \
__asm por mm6,mm5 /* mm6=|src1-src3| */ \
__asm movq [edi], mm6 /* here mm1=src1 */ \
__asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end */ \
__asm por mm3,mm7 /* here mm3=new src3 */ \
}
// ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
// process 8 disparities at a time
//
// Src1: right
// Src2: top
// Src3: left
//
// TODO? divide the result by 2 (shift)
inline int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm,
unsigned char* Dest, int l, int imageSize, int width)
{
if (l < 8) return 0; // image size must be at least 8 bytes
const int back_step2 = 7*imageSize;
__asm
{
mov eax, rightIm
mov edx, leftIm
mov edi, Dest
mov ecx, l
shr ecx, 3
movq mm0,[edx] // mm0=src3
movq mm0,[edx] // mm0=src3
align 16
inner_loop:
movq mm1,[eax] // mm1=src1
movq mm3,mm0 // mm3=src3
// -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
movq mm0,[edx+8]
add eax,8
movq mm5,mm1 // mm5=src1
psubusb mm5,mm3 // mm5=src1-src3
movq mm6,mm3 // mm6=src3
psubusb mm6,mm1 // mm3=src3-src1
movq mm7, mm0
psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
por mm6,mm5 // mm6=|src1-src3|
movq [edi], mm6
psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
por mm3,mm7 // here mm3=new src3
// - 2 ----------------
_ABS_DIFF_HORIZ(48)
_ABS_DIFF_HORIZ(40)
_ABS_DIFF_HORIZ(32)
_ABS_DIFF_HORIZ(24)
_ABS_DIFF_HORIZ(16)
_ABS_DIFF_HORIZ(8)
// - 8 ----------------
movq mm5,mm1 // mm5=src1
add edi, imageSize
psubusb mm5,mm3 // mm5=src1-src3
psubusb mm3,mm1 // mm3=src3-src1
por mm3,mm5 // mm6=|src1-src3|
movq [edi], mm3
// -------------------------------------------------------------
//
add edx,8
sub edi, back_step2
add edi,8
dec ecx
jnz inner_loop
emms
}
return 1;
}
// ----------------------
// FULL IMAGE, BEST ONLY : Keith's code
inline int findMinimumCorrelation(
const unsigned char *CurrentCorrelation,
unsigned char CurrentDisparity,
unsigned char *Disparity,
unsigned char *BestCorrelation, int bytecount)
{
if ((bytecount < 8) || ((bytecount % 8) != 0)) {
return 0;
}
__asm {
// load ecx with the pixelblock count = bytecount / 8
mov ecx, bytecount
shr ecx, 3
// setup mm0 with 8 copies of the disparity constant
mov al, CurrentDisparity
mov ah, al
mov bx, ax
shl eax, 16
mov ax, bx
movd mm0, eax
movd mm1, eax
punpckldq mm0, mm1
// setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
mov eax, 0x80808080
movd mm1, eax
movd mm2, eax
punpckldq mm1, mm2
// setup the image pointers
mov eax, BestCorrelation
mov esi, CurrentCorrelation
mov edi, Disparity
pixel_loop:
movq mm2, [esi] // current correlation
movq mm3, [eax] // best correlation
// check for updates
movq mm5, mm2 // copy the current correlation
pxor mm5, mm1 // convert from unsigned range to signed range
movq mm6, mm3 // copy the best correlation
pxor mm6, mm1 // convert from unsigned range to signed range
pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
// 1 indicates current > best, so keep best
// 0 indicates current <= best, so use new value
// BYPASS
// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
// abort remainder if not updating best correlation
pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
// 0 indicates current > best, so keep best
// 1 indicates current <= best, so use new value
packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
// 11111111 11111111 => 11111111 some replaced
// 11111111 00000000 => 11111111 some replaced
// 00000000 11111111 => 11111111 some replaced
// 00000000 00000000 => 00000000 no replacements
// we don't need to backup ebx because its not used in this routine
// movd mm7, ebx // make a backup of eax
movd ebx, mm6 // get the saturated mask
test ebx, ebx // test ebx => yields 0 iff no substitutions will occur
// movd ebx, mm7 // restore ebx
jz bypass // store mm4 (second correlation) to [ebx]
// Update best Correlation
movq mm6, mm5 // mm6 := mask
movq mm7, mm5 // mm7 := mask
pand mm6, mm3 // best correlation values to keep
pandn mm7, mm2 // current correlation value to move to best correlation
por mm6, mm7 // merge values
movq [eax], mm6 // store values
// update disparity
movq mm2, [edi] // get disparity map
movq mm6, mm5 // mm6 := mask
pand mm5, mm2 // select disparity map values to keep
pandn mm6, mm0 // select current disparity values to move to disparity map
por mm5, mm6 // merge values
movq [edi], mm5 // store values
bypass:
add eax, 8
add esi, 8
add edi, 8
dec ecx
jnz pixel_loop
emms;
}
return 1;
}
/*int initMinimumCorrelation(
const unsigned char *CurrentCorrelation,
unsigned char disparityInit,
unsigned char *Disparity,
unsigned char *BestCorrelation,
unsigned char *SecondCorrelation,
int bytecount)
{
for (int i=0; i signed conversion
mov eax, 0x80808080
movd mm1, eax
movd mm2, eax
punpckldq mm1, mm2
// setup the image pointers
mov eax, BestCorrelation
mov ebx, SecondCorrelation
mov esi, CurrentCorrelation
mov edi, Disparity
pixel_loop:
movq mm2, [esi] // current correlation
movq mm4, [ebx] // second correlation
// convert the current correlation from unsigned range to signed range
movq mm5, mm2 // copy the current correlation
pxor mm5, mm1 // convert from unsigned range to signed range
movq mm7, mm5 // copy converted to mm7
// check for second correlation updates
movq mm6, mm4 // copy second best correlation
pxor mm6, mm1 // convert from unsigned range to signed range
pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask
// BYPASS 1
// skip remainder if second correlation is not to be updated
// this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes
pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
pxor mm6, mm7 // mm6 = mm7 xor 0xFFFFFFFF = not mm7
// 0 indicates current > second, so keep old value
// 1 indicates current <= second, so use new value
packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
// 11111111 11111111 => 11111111 some replaced
// 11111111 00000000 => 11111111 some replaced
// 00000000 11111111 => 11111111 some replaced
// 00000000 00000000 => 00000000 no replacements
// don't need to backup edx because its not used in this routine
// movd mm3, edx // make a backup of edx
movd edx, mm6 // get the saturated mask
test edx, edx // test edx => yields 0 iff no replacements will occur
// movd edx, mm3 // restore edx
jz bypass1
// direct update second correlation (get values from current)
// mm7 already has mask
// movq mm6, mm7 // mm6 := mask
// pand mm6, mm4 // second correlation values to keep
// pandn mm7, mm2 // current correlation values to move to second correlation
// por mm6, mm7 // merge value => direct updated second correlation
// movq mm4, mm6 // store values (*** this instruction could be eliminated!)
pand mm4, mm7 // second correlation values to keep
pandn mm7, mm2 // current correlation values to move to second correlation
por mm4, mm7 // merge value => direct updated second correlation
// check for best correlation updates
movq mm3, [eax] // best correlation
// mm5 has converted current correlation
movq mm6, mm3 // copy the best correlation
pxor mm6, mm1 // convert from unsigned range to signed range
pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
// 1 indicates current > best, so keep best
// 0 indicates current <= best, so use new value
// BYPASS 2
// this phase adds 8 additional instructions, but could skip 2 writes and 1 read
// abort remainder if not updating best correlation
pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF
pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5
// 0 indicates current > best, so keep best
// 1 indicates current <= best, so use new value
packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation)
// 11111111 11111111 => 11111111 some replaced
// 11111111 00000000 => 11111111 some replaced
// 00000000 11111111 => 11111111 some replaced
// 00000000 00000000 => 00000000 no replacements
// don't need to backup edx because its not used in this routine
// movd mm7, edx // make a backup of edx
movd edx, mm6 // get the saturated mask
test edx, edx // test edx => yields 0 iff no substitutions will occur
// movd edx, mm7 // restore edx
jz bypass2 // store mm4 (second correlation) to [ebx]
// indirect update second correlation (pushed down from best)
movq mm6, mm5 // mm6 := mask
movq mm7, mm5 // mm7 := mask
pand mm6, mm4 // second correlation values to keep
pandn mm7, mm3 // best correlations to move to second correlation
por mm6, mm7 // merge values
movq [ebx], mm6 // store values
// direct Update best Correlation
movq mm6, mm5 // mm6 := mask
movq mm7, mm5 // mm7 := mask
pand mm6, mm3 // best correlation values to keep
pandn mm7, mm2 // current correlation value to move to best correlation
por mm6, mm7 // merge values
movq [eax], mm6 // store values
// update disparity
movq mm2, [edi] // get disparity map
movq mm6, mm5 // mm6 := mask
pand mm5, mm2 // select disparity map values to keep
pandn mm6, mm0 // select current disparity values to move to disparity map
por mm5, mm6 // merge values
movq [edi], mm5 // store values
bypass1:
next_pixel:
add eax, 8
add ebx, 8
add esi, 8
add edi, 8
dec ecx
jnz pixel_loop
jmp done
bypass2:
movq [ebx], mm4;
jmp next_pixel
done:
emms;
}
return 1;
}
inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
{
switch (maskSize)
{
case 5:
sum_Row_5(im, im_out, rowSize);
break;
case 9:
case 8:
case 7:
case 6:
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
case 13:
case 12:
case 11:
case 10:
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
case 17:
case 16:
case 15:
case 14 :
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
default:
break;
}
}
inline void sum_Row(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)
{
switch (maskSize)
{
case 5:
sum_Row_5(im, im_out, rowSize);
break;
case 9:
case 8:
case 7:
case 6:
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
case 13:
case 12:
case 11:
case 10:
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
case 17:
case 16:
case 15:
case 14 :
sum_Row_5(im, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
sum_Row_5(im_out, im_out, rowSize);
break;
default:
break;
}
}
#define aim_Sum_Words_In_MM1 __asm \
{ \
__asm movq mm4, mm1 \
__asm movq mm2, mm1 \
\
__asm movq mm3, mm1 \
__asm psllq mm1, 16 \
\
__asm psrlq mm2, 16 \
__asm paddw mm4, mm2 \
\
__asm paddw mm3, mm1 \
__asm psrlq mm2, 16 \
\
__asm psllq mm1, 16 \
__asm paddw mm4, mm2 \
\
__asm psrlq mm2, 16 \
__asm paddw mm3, mm1 \
\
__asm psllq mm1, 16 \
__asm paddw mm4, mm2 \
\
__asm paddw mm3, mm1 \
}
// apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
// output : im_out (words)
inline void sum_Row_5(uchar* im, unsigned short* im_out, int rowSize)
{
__asm {
mov eax, rowSize
mov ebx, im
mov ecx, im_out
pxor mm6, mm6 // mm6 = x00000000
//Process the first quad word, but save only the second result"
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
//Process low word
movq mm1, [ebx] // Copy...
punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A]
aim_Sum_Words_In_MM1
//Store the result Only in the accumulator
movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D]
//Process high word
movq mm1, [ebx] // Copy...
punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E]
add ebx, 8 // Update input pointer
aim_Sum_Words_In_MM1
//Add to the previous data ...
// mm3=[E+F+G+H E+F+G E+F E]
// mm4=[H G+H F+G+H E+F+G+H]
paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
// translate everything to 2 words on the left
movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
movq mm0, mm1 // mm0 = [D+E+F+G+H C+D+E+F+G]
psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
movq [ecx], mm7 // Store the final result
add ecx, 8 // Update output pointer
movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H]
sub eax, 8 // Update the number of points left
// Start the loop
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
movq mm1, [ebx] // Load data
//Process low word
punpcklbw mm1, mm6 // Expand low word bytes into words
aim_Sum_Words_In_MM1
//Add to the previous data
//prefetcht1 [ecx+16]
paddw mm7, mm3 // The current word of the accum
// translate everything to 2 words on the left
// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C]
punpckldq mm0, mm7 // mm0 = [F E D C]
movq [ecx], mm0
sub eax, 8 // Update the number of points left
movq mm0, mm4 // Update accumulator
psrlq mm7, 32 // mm7 = [0 0 H G]
//Process high word
movq mm1, [ebx] // Copy...
punpckhbw mm1, mm6 // Expand high word bytes into words
aim_Sum_Words_In_MM1
//Add to the previous data
paddw mm0, mm3 // The current word of the accum
// translate everything to 2 words on the left
// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
punpckldq mm7, mm0 // mm7 = [F E D C]
add ebx, 8 // Update input pointer
movq [ecx+8], mm7
psrlq mm0, 32 // mm0 = [0 0 H G]
movq mm7, mm4 // Update accumulator
add ecx, 16 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
// apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
// output : im_out (words)
inline void sum_Row_5(ushort* im, ushort* im_out, int rowSize)
{
__asm {
mov eax, rowSize
mov ebx, im
mov ecx, im_out
//Process the first quad word, but save only the second result"
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
movq mm1, [ebx] // Load data (4 words)
add ebx, 8 // Update input pointer
//Process low word
aim_Sum_Words_In_MM1
//Store the result Only in the accumulator
movq mm7, mm4 // Update accumulator
//Process high word
movq mm1, [ebx] // Copy...
aim_Sum_Words_In_MM1
add ebx, 8
//Add to the previous data
paddw mm7, mm3 // The current word of the accum
// translate everything to 2 words on the left
movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
movq [ecx], mm7 // Store the final result
movq mm7, mm4 // Update accumulator
add ecx, 8 // Update output pointer
sub eax, 8 // Update the number of points left
// Start the loop
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
movq mm1, [ebx] // Load data
aim_Sum_Words_In_MM1
//Add to the previous data
//prefetcht0 [ecx + 32]
//prefetcht0 [ebx + 48]
paddw mm7, mm3 // The current word of the accum
psrlw mm7, 2 // divide result by ...
// translate everything to 2 words on the left
// mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C]
punpckldq mm0, mm7 // mm0 = [F E D C]
movq [ecx], mm0
sub eax, 8 // Update the number of points left
movq mm0, mm4 // Update accumulator
psrlq mm7, 32 // mm7 =[0 0 H G]
//Process high word
movq mm1, [ebx+8] // Copy...
aim_Sum_Words_In_MM1
//Add to the previous data
paddw mm0, mm3 // The current word of the accum
psrlw mm0, 2 // divide result by ...
// translate everything to 2 words on the left
// mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
punpckldq mm7, mm0 // mm7 = [F E D C]
add ebx, 16 // Update input pointer
movq [ecx+8], mm7
psrlq mm0, 32 // mm0 = [0 0 H G]
movq mm7, mm4 // Update accumulator
add ecx, 16 // Update output pointer */
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
// apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
// result in 'im_out'
inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
switch (sizeMask)
{
case 5: avg_Col_5(im,im_out,dataSize,width);
break;
case 7: avg_Col_7(im,im_out,dataSize,width);
break;
case 9: avg_Col_9(im,im_out,dataSize,width);
break;
case 11: avg_Col_11(im,im_out,dataSize,width);
break;
case 13: avg_Col_13(im,im_out,dataSize,width);
break;
case 15: avg_Col_15(im,im_out,dataSize,width);
break;
case 17: avg_Col_17(im,im_out,dataSize,width);
break;
default: avg_Col_5(im,im_out,dataSize,width);
break;
}
}
#define macro_add __asm \
{ \
__asm paddusw mm3, [edx] \
__asm paddusw mm2, [edx+8] \
__asm add edx, edi \
}
inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi // ebx = ebx-4*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 3
psrlw mm2, 3
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-6*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
// 1
movq mm3, [edx] // mm3 = 4 words of im
add ebx, 16
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 3
psrlw mm2, 3
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-8*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 3
psrlw mm2, 3
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-10*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 4
psrlw mm2, 4
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-12*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 4
psrlw mm2, 4
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-14*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 4
psrlw mm2, 4
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi
sub ebx, edi // ebx = ebx-16*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
macro_add
// divide results by ...
psrlw mm3, 4
psrlw mm2, 4
// convert [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi // ebx = ebx-4*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
// save [mm2 mm3] as 8 bytes
packuswb mm3,mm2
movq [ecx], mm3
sub eax, 8 // Update the number of points left
add ecx, 8 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
{
__asm {
mov edi, width
shl edi, 1 // edi = 2*width
mov eax, dataSize
mov ecx, im_out
mov ebx, im
sub ebx, edi
sub ebx, edi // ebx = ebx-4*width
test eax, eax // Is there anything to do?"
jz end_sum_loop // Jump out if necessary
row_sum_loop:
test eax, eax // Is there anything to do?
jz end_sum_loop // Jump out if necessary
mov edx, ebx
add ebx, 16
// 1
movq mm3, [edx] // mm3 = 4 words of im
movq mm2, [edx+8] // mm2 = next 4 words of im
add edx, edi
macro_add
macro_add
macro_add
macro_add
// save [mm2 mm3] as words
movq [ecx], mm3
movq [ecx+8], mm2
sub eax, 8 // Update the number of points left
add ecx, 16 // Update output pointer
jmp row_sum_loop // Loop
//Cleanup
end_sum_loop:
emms
}
}
// compare bestScores and secondScores. if secondthresh
// 0 otherwise
pand mm3, mm2
pandn mm2, mm7
por mm3, mm2
movq [edx], mm3
sub eax, 8 // Update the number of points left
add ebx, 8 // Update output pointer
add ecx, 8
add edx, 8
jmp comp_loop // Loop
//Cleanup
end_loop:
emms
}
}
// windowWidth must be multiple of 8
inline void cropImage(const uchar* imSrc, int width, int height,
uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
{
int w8 = windowWidth/8;
int step = width-windowWidth;
const uchar* srcNewOrigin = imSrc+x0+y0*width;
__asm {
mov ecx, windowHeight
mov edx, w8
mov eax, srcNewOrigin
mov ebx, imDest
pixel_loop:
movq mm1, [eax]
movq [ebx], mm1
add eax, 8
add ebx, 8
dec edx
jnz pixel_loop
mov edx, w8
add eax, step
dec ecx
jnz pixel_loop
jmp done
done:
emms;
}
}
// return the average pixel value
inline float pixelMean(const uchar* im, int imageSize)
{
int sum;
__asm {
mov ecx, imageSize
shr ecx, 3
mov eax, im
pxor mm7,mm7 // mm7 used as accumulator
pxor mm0,mm0 // mm0 = 0
pixel_loop:
movq mm1, [eax]
movq mm2,mm1
punpcklbw mm2, mm0
punpckhbw mm1, mm0
paddw mm2,mm1
movq mm1,mm2
punpcklwd mm2, mm0
punpckhwd mm1, mm0
paddd mm2,mm1
paddd mm7,mm2
add eax, 8
dec ecx
jnz pixel_loop
jmp done
done:
movd ebx, mm7
psrlq mm7, 32
movd edx, mm7
add ebx, edx
mov sum, ebx
emms
}
return sum / (float)imageSize;
}
// -------------------------------------------------------------
// apply mask:
// if mask[]=undefined_val im[]->im[]
// otherwise, im[]->mask[]
// ....... this one may not be exact :-(
inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
{
__asm {
// setup mm0 with 8 copies of 'undefined_val'
mov al, undefined_val
mov ah, al
mov bx, ax
shl eax, 16
mov ax, bx
movd mm0, eax
movd mm1, eax
punpckldq mm0, mm1
mov ecx, imageSize
shr ecx, 3
mov eax, im
mov ebx, mask
pixel_loop:
movq mm1, [eax]
movq mm2, [ebx]
movq mm3, mm2
pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val
// -> x00 otherwise
pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val
// = x00 otherwise
por mm3, mm2
movq [eax], mm3
add eax, 8
add ebx, 8
dec ecx
jnz pixel_loop
jmp done
done:
emms
}
}
inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
{
for (int i=0; i