www.pudn.com > Estereo.rar > processingMMX.cpp
/*************************************************************************** * * Copyright 2004 by the Massachusetts Institute of Technology. All * rights reserved. * * Developed by the Vision Interface Group * at the Computer Sciences and Artificial Intelligence Laboratory, * MIT, Cambridge, Massachusetts. * * Permission to use, copy, or modify this software and its documentation * for educational and research purposes only and without fee is hereby * granted, provided that this copyright notice and the original authors's * names appear on all copies and supporting documentation. If individual * files are separated from this distribution directory structure, this * copyright notice must be included. For any other uses of this software, * in original or modified form, including but not limited to distribution * in whole or in part, specific prior permission must be obtained from * MIT. These programs shall not be used, rewritten, or adapted as the * basis of a commercial software or hardware product without first * obtaining appropriate licenses from MIT. MIT. makes no representations * about the suitability of this software for any purpose. It is provided * "as is" without express or implied warranty. * **************************************************************************/ #include "stereoMatching.h" #include "processingMMX.h" //#include// ImgSub2: D = saturation0(|S1 - S2| + |S1 - S3|) // TODO? divide the result by 2 (shift) int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2, const unsigned char *Src3, unsigned char *Dest, int l) { if (l < 8) return 0; // image size must be at least 8 bytes __asm { mov eax, Src1 mov ebx, Src2 mov edx, Src3 mov edi, Dest mov ecx, l shr ecx, 3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm2,[ebx] // mm2=src2 movq mm4,mm1 // mm4=mm1 psubusb mm4,mm2 // mm4 = src1 - src2 movq mm3,[edx] // mm3=src3 psubusb mm2,mm1 // mm2 = src2 - src1 movq mm5,mm1 // mm5=src1 por mm2,mm4 // mm2=|src1-src2| psubusb mm5,mm3 // mm4=src1-src3 psubusb mm3,mm1 // mm3=src3-src1 por mm3,mm5 // mm3=|src1-src3| paddusb mm2,mm3 // mm2 = |src1-src2|+|src1-src3| movq [edi], mm2 add eax,8 add ebx,8 add edx,8 add edi,8 dec ecx jnz inner_loop emms } return 1; } // ImgSub2: D = saturation0(|S1 - S2|) // TODO? divide the result by 2 (shift) int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2, const unsigned char *Dest, int l) { if (l < 8) return 0; // image size must be at least 8 bytes __asm { mov eax, Src1 mov ebx, Src2 mov edi, Dest mov ecx, l shr ecx, 3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm2,[ebx] // mm2=src2 movq mm4,mm1 // mm4=mm1 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 por mm2,mm4 // mm2=|src1-src2| movq [edi], mm2 add eax,8 add ebx,8 add edi,8 dec ecx jnz inner_loop emms } return 1; } #define _ABS_DIFF_TRI(Z) __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \ \ __asm movq mm7, mm0 \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm psllq mm7,Z \ \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ \ __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \ __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \ \ __asm movq mm6,mm3 /* mm6=src3*/ \ __asm psubusb mm6,mm1 /* mm3=src3-src1*/ \ \ __asm por mm6,mm5 /* mm6=|src1-src3|*/ \ __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \ \ __asm movq [edi], mm4 /* here mm1=src1*/ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\ } #define _ABS_DIFF_TRI_prefetch(Z, X) __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm por mm3,mm7 /* here mm2=new src2 mm3=new src3 */ \ \ __asm movq mm7, mm0 \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm prefetcht0 [ebx + X] \ __asm psllq mm7,Z \ \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ \ \ __asm movq mm2,[ebx] /* mm2= src2 + 'width' = new src2*/ \ __asm psubusb mm5,mm3 /* mm5=src1-src3*/ \ \ __asm movq mm6,mm3 /* mm6=src3*/ \ __asm psubusb mm6,mm1 /* mm3=src3-src1*/ \ \ __asm por mm6,mm5 /* mm6=|src1-src3|*/ \ __asm paddusb mm4,mm6 /* mm4 = |src1-src2|+|src1-src3|*/ \ \ __asm movq [edi], mm4 /* here mm1=src1*/ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end*/\ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // // Src1: right // Src2: top // Src3: left // // TODO? divide the result by 2 (shift) int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2, const unsigned char *Src3, unsigned char* Dest1, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step1 = 7*width; const int back_step2 = 7*imageSize; __asm { mov eax, Src1 mov ebx, Src2 mov edx, Src3 mov edi, Dest1 mov ecx, l shr ecx, 3 movq mm0,[edx] // mm0=src3 movq mm0,[edx] // mm0=src3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm3,mm0 // mm3=src3 movq mm2,[ebx] // mm2=src2 add eax,8 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- movq mm4,mm1 // mm4=mm1 add ebx,width psubusb mm4,mm2 // mm4 = src1 - src2 //prefetcht0 [ebx + 32 + 2*320] movq mm0,[edx+8] psubusb mm2,mm1 // mm2 = src2 - src1 movq mm5,mm1 // mm5=src1 por mm4,mm2 // mm2=|src1-src2| movq mm2,[ebx] // mm2= src2 + 'width' = new src2 psubusb mm5,mm3 // mm5=src1-src3 movq mm6,mm3 // mm6=src3 psubusb mm6,mm1 // mm3=src3-src1 movq mm7, mm0 psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end por mm6,mm5 // mm6=|src1-src3| paddusb mm4,mm6 // mm4 = |src1-src2|+|src1-src3| movq [edi], mm4 psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3 // ------------------------------------------------------------- // - 2 ---------------- _ABS_DIFF_TRI(48) // - 3 ---------------- _ABS_DIFF_TRI(40) // - 4 ---------------- _ABS_DIFF_TRI(32) // _ABS_DIFF_TRI_prefetch(32,24 + 3*320) // - 5 ---------------- _ABS_DIFF_TRI(24) // - 6 ---------------- _ABS_DIFF_TRI(16) // - 7 ---------------- _ABS_DIFF_TRI(8) // - 8 ---------------- movq mm4,mm1 // mm4=mm1 por mm3,mm7 // here mm2=new src2 mm3=new src3 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 movq mm5,mm1 // mm5=src1 por mm4,mm2 // mm2=|src1-src2| psubusb mm5,mm3 // mm5=src1-src3 psubusb mm3,mm1 // mm3=src3-src1 por mm3,mm5 // mm6=|src1-src3| paddusb mm4,mm3 // mm4 = |src1-src2|+|src1-src3| add edi, imageSize movq [edi], mm4 // here mm1=src1 // ------------------------------------------------------------- // sub ebx, back_step1 add ebx,8 add edx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // macro: in: mm1,mm2 #define _ABS_DIFF_ __asm \ { \ __asm movq mm4,mm1 /* mm4=mm1 */ \ __asm psubusb mm4,mm2 /* mm4 = src1 - src2 */ \ __asm psubusb mm2,mm1 /* mm2 = src2 - src1 */ \ __asm por mm4,mm2 /* mm2=|src1-src2| */ \ __asm add ebx, width \ __asm add edi, imageSize \ __asm movq mm2,[ebx] \ __asm movq [edi], mm4 /* here mm1=src1 */ \ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // Src1: right // Src2: top // TODO? divide the result by 2 (shift) int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2, unsigned char* Dest1, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step1 = 7*width; const int back_step2 = 7*imageSize; __asm { mov eax, Src1 mov ebx, Src2 mov edi, Dest1 mov ecx, l shr ecx, 3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm2,[ebx] // mm2=src2 add eax,8 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ _ABS_DIFF_ // - 8 ---------------- movq mm4,mm1 // mm4=mm1 psubusb mm4,mm2 // mm4 = src1 - src2 psubusb mm2,mm1 // mm2 = src2 - src1 por mm4,mm2 // mm2=|src1-src2| add edi, imageSize movq [edi], mm4 // here mm1=src1 // ------------------------------------------------------------- // sub ebx, back_step1 add ebx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // macro: in: mm1,mm2 #define _ABS_DIFF_HORIZ(Z) __asm \ { \ __asm movq mm7, mm0 \ __asm add edi, imageSize \ __asm movq mm5,mm1 /* mm5=src1 */ \ __asm psllq mm7, Z \ __asm psubusb mm5,mm3 /* mm5=src1-src3 */ \ __asm movq mm6,mm3 /* mm6=src3 */ \ __asm psubusb mm6,mm1 /* mm3=src3-src1 */ \ __asm por mm6,mm5 /* mm6=|src1-src3| */ \ __asm movq [edi], mm6 /* here mm1=src1 */ \ __asm psrlq mm3, 8 /* mm3 = src3 + '1' ... with [x00000000] at the end */ \ __asm por mm3,mm7 /* here mm3=new src3 */ \ } // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|) // process 8 disparities at a time // // Src1: right // Src2: top // Src3: left // // TODO? divide the result by 2 (shift) int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm, unsigned char* Dest, int l, int imageSize, int width) { if (l < 8) return 0; // image size must be at least 8 bytes const int back_step2 = 7*imageSize; __asm { mov eax, rightIm mov edx, leftIm mov edi, Dest mov ecx, l shr ecx, 3 movq mm0,[edx] // mm0=src3 movq mm0,[edx] // mm0=src3 align 16 inner_loop: movq mm1,[eax] // mm1=src1 movq mm3,mm0 // mm3=src3 // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 -- movq mm0,[edx+8] add eax,8 movq mm5,mm1 // mm5=src1 psubusb mm5,mm3 // mm5=src1-src3 movq mm6,mm3 // mm6=src3 psubusb mm6,mm1 // mm3=src3-src1 movq mm7, mm0 psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end por mm6,mm5 // mm6=|src1-src3| movq [edi], mm6 psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3 por mm3,mm7 // here mm3=new src3 // - 2 ---------------- _ABS_DIFF_HORIZ(48) _ABS_DIFF_HORIZ(40) _ABS_DIFF_HORIZ(32) _ABS_DIFF_HORIZ(24) _ABS_DIFF_HORIZ(16) _ABS_DIFF_HORIZ(8) // - 8 ---------------- movq mm5,mm1 // mm5=src1 add edi, imageSize psubusb mm5,mm3 // mm5=src1-src3 psubusb mm3,mm1 // mm3=src3-src1 por mm3,mm5 // mm6=|src1-src3| movq [edi], mm3 // ------------------------------------------------------------- // add edx,8 sub edi, back_step2 add edi,8 dec ecx jnz inner_loop emms } return 1; } // ---------------------- // FULL IMAGE, BEST ONLY : Keith's code int findMinimumCorrelation( const unsigned char *CurrentCorrelation, unsigned char CurrentDisparity, unsigned char *Disparity, unsigned char *BestCorrelation, int bytecount) { if ((bytecount < 8) || ((bytecount % 8) != 0)) { return 0; } __asm { // load ecx with the pixelblock count = bytecount / 8 mov ecx, bytecount shr ecx, 3 // setup mm0 with 8 copies of the disparity constant mov al, CurrentDisparity mov ah, al mov bx, ax shl eax, 16 mov ax, bx movd mm0, eax movd mm1, eax punpckldq mm0, mm1 // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion mov eax, 0x80808080 movd mm1, eax movd mm2, eax punpckldq mm1, mm2 // setup the image pointers mov eax, BestCorrelation mov esi, CurrentCorrelation mov edi, Disparity pixel_loop: movq mm2, [esi] // current correlation movq mm3, [eax] // best correlation // check for updates movq mm5, mm2 // copy the current correlation pxor mm5, mm1 // convert from unsigned range to signed range movq mm6, mm3 // copy the best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask // 1 indicates current > best, so keep best // 0 indicates current <= best, so use new value // BYPASS // this phase adds 8 additional instructions, but could skip 2 writes and 1 read // abort remainder if not updating best correlation pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5 // 0 indicates current > best, so keep best // 1 indicates current <= best, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // we don't need to backup ebx because its not used in this routine // movd mm7, ebx // make a backup of eax movd ebx, mm6 // get the saturated mask test ebx, ebx // test ebx => yields 0 iff no substitutions will occur // movd ebx, mm7 // restore ebx jz bypass // store mm4 (second correlation) to [ebx] // Update best Correlation movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm3 // best correlation values to keep pandn mm7, mm2 // current correlation value to move to best correlation por mm6, mm7 // merge values movq [eax], mm6 // store values // update disparity movq mm2, [edi] // get disparity map movq mm6, mm5 // mm6 := mask pand mm5, mm2 // select disparity map values to keep pandn mm6, mm0 // select current disparity values to move to disparity map por mm5, mm6 // merge values movq [edi], mm5 // store values bypass: add eax, 8 add esi, 8 add edi, 8 dec ecx jnz pixel_loop emms; } return 1; } /*int initMinimumCorrelation( const unsigned char *CurrentCorrelation, unsigned char disparityInit, unsigned char *Disparity, unsigned char *BestCorrelation, unsigned char *SecondCorrelation, int bytecount) { for (int i=0; i signed conversion mov eax, 0x80808080 movd mm1, eax movd mm2, eax punpckldq mm1, mm2 // setup the image pointers mov eax, BestCorrelation mov ebx, SecondCorrelation mov esi, CurrentCorrelation mov edi, Disparity pixel_loop: movq mm2, [esi] // current correlation movq mm4, [ebx] // second correlation // convert the current correlation from unsigned range to signed range movq mm5, mm2 // copy the current correlation pxor mm5, mm1 // convert from unsigned range to signed range movq mm7, mm5 // copy converted to mm7 // check for second correlation updates movq mm6, mm4 // copy second best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask // BYPASS 1 // skip remainder if second correlation is not to be updated // this phase adds an addition 8 instructions, but it could save as 1 memory read and 3 writes pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm7 // mm6 = mm7 xor 0xFFFFFFFF = not mm7 // 0 indicates current > second, so keep old value // 1 indicates current <= second, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // don't need to backup edx because its not used in this routine // movd mm3, edx // make a backup of edx movd edx, mm6 // get the saturated mask test edx, edx // test edx => yields 0 iff no replacements will occur // movd edx, mm3 // restore edx jz bypass1 // direct update second correlation (get values from current) // mm7 already has mask // movq mm6, mm7 // mm6 := mask // pand mm6, mm4 // second correlation values to keep // pandn mm7, mm2 // current correlation values to move to second correlation // por mm6, mm7 // merge value => direct updated second correlation // movq mm4, mm6 // store values (*** this instruction could be eliminated!) pand mm4, mm7 // second correlation values to keep pandn mm7, mm2 // current correlation values to move to second correlation por mm4, mm7 // merge value => direct updated second correlation // check for best correlation updates movq mm3, [eax] // best correlation // mm5 has converted current correlation movq mm6, mm3 // copy the best correlation pxor mm6, mm1 // convert from unsigned range to signed range pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask // 1 indicates current > best, so keep best // 0 indicates current <= best, so use new value // BYPASS 2 // this phase adds 8 additional instructions, but could skip 2 writes and 1 read // abort remainder if not updating best correlation pcmpeqb mm6, mm6 // mm6 = 0xFFFFFFFF pxor mm6, mm5 // mm6 = mm5 xor 0xFFFFFFFF = not mm5 // 0 indicates current > best, so keep best // 1 indicates current <= best, so use new value packsswb mm6, mm6 // pack it into the lower dword of mm6 (unsigned saturation) // 11111111 11111111 => 11111111 some replaced // 11111111 00000000 => 11111111 some replaced // 00000000 11111111 => 11111111 some replaced // 00000000 00000000 => 00000000 no replacements // don't need to backup edx because its not used in this routine // movd mm7, edx // make a backup of edx movd edx, mm6 // get the saturated mask test edx, edx // test edx => yields 0 iff no substitutions will occur // movd edx, mm7 // restore edx jz bypass2 // store mm4 (second correlation) to [ebx] // indirect update second correlation (pushed down from best) movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm4 // second correlation values to keep pandn mm7, mm3 // best correlations to move to second correlation por mm6, mm7 // merge values movq [ebx], mm6 // store values // direct Update best Correlation movq mm6, mm5 // mm6 := mask movq mm7, mm5 // mm7 := mask pand mm6, mm3 // best correlation values to keep pandn mm7, mm2 // current correlation value to move to best correlation por mm6, mm7 // merge values movq [eax], mm6 // store values // update disparity movq mm2, [edi] // get disparity map movq mm6, mm5 // mm6 := mask pand mm5, mm2 // select disparity map values to keep pandn mm6, mm0 // select current disparity values to move to disparity map por mm5, mm6 // merge values movq [edi], mm5 // store values bypass1: next_pixel: add eax, 8 add ebx, 8 add esi, 8 add edi, 8 dec ecx jnz pixel_loop jmp done bypass2: movq [ebx], mm4; jmp next_pixel done: emms; } return 1; } void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize) { switch (maskSize) { case 5: sum_Row_5(im, im_out, rowSize); break; case 9: case 8: case 7: case 6: sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; case 13: case 12: case 11: case 10: sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; case 17: case 16: case 15: case 14 : sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; default: break; } } void sum_Row(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize) { switch (maskSize) { case 5: sum_Row_5(im, im_out, rowSize); break; case 9: case 8: case 7: case 6: sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; case 13: case 12: case 11: case 10: sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; case 17: case 16: case 15: case 14 : sum_Row_5(im, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); sum_Row_5(im_out, im_out, rowSize); break; default: break; } } #define aim_Sum_Words_In_MM1 __asm \ { \ __asm movq mm4, mm1 \ __asm movq mm2, mm1 \ \ __asm movq mm3, mm1 \ __asm psllq mm1, 16 \ \ __asm psrlq mm2, 16 \ __asm paddw mm4, mm2 \ \ __asm paddw mm3, mm1 \ __asm psrlq mm2, 16 \ \ __asm psllq mm1, 16 \ __asm paddw mm4, mm2 \ \ __asm psrlq mm2, 16 \ __asm paddw mm3, mm1 \ \ __asm psllq mm1, 16 \ __asm paddw mm4, mm2 \ \ __asm paddw mm3, mm1 \ } // apply the mask [1 1 1 1 1] to the 1-D array im (bytes) // output : im_out (words) void sum_Row_5(uchar* im, unsigned short* im_out, int rowSize) { __asm { mov eax, rowSize mov ebx, im mov ecx, im_out pxor mm6, mm6 // mm6 = x00000000 //Process the first quad word, but save only the second result" test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary //Process low word movq mm1, [ebx] // Copy... punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A] aim_Sum_Words_In_MM1 //Store the result Only in the accumulator movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D] //Process high word movq mm1, [ebx] // Copy... punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E] add ebx, 8 // Update input pointer aim_Sum_Words_In_MM1 //Add to the previous data ... // mm3=[E+F+G+H E+F+G E+F E] // mm4=[H G+H F+G+H E+F+G+H] paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E] // translate everything to 2 words on the left movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E] psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G] movq mm0, mm1 // mm0 = [D+E+F+G+H C+D+E+F+G] psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0] movq [ecx], mm7 // Store the final result add ecx, 8 // Update output pointer movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H] sub eax, 8 // Update the number of points left // Start the loop row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary movq mm1, [ebx] // Load data //Process low word punpcklbw mm1, mm6 // Expand low word bytes into words aim_Sum_Words_In_MM1 //Add to the previous data //prefetcht1 [ecx+16] paddw mm7, mm3 // The current word of the accum // translate everything to 2 words on the left // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C] punpckldq mm0, mm7 // mm0 = [F E D C] movq [ecx], mm0 sub eax, 8 // Update the number of points left movq mm0, mm4 // Update accumulator psrlq mm7, 32 // mm7 = [0 0 H G] //Process high word movq mm1, [ebx] // Copy... punpckhbw mm1, mm6 // Expand high word bytes into words aim_Sum_Words_In_MM1 //Add to the previous data paddw mm0, mm3 // The current word of the accum // translate everything to 2 words on the left // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C] punpckldq mm7, mm0 // mm7 = [F E D C] add ebx, 8 // Update input pointer movq [ecx+8], mm7 psrlq mm0, 32 // mm0 = [0 0 H G] movq mm7, mm4 // Update accumulator add ecx, 16 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } // apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words) // output : im_out (words) void sum_Row_5(ushort* im, ushort* im_out, int rowSize) { __asm { mov eax, rowSize mov ebx, im mov ecx, im_out //Process the first quad word, but save only the second result" test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary movq mm1, [ebx] // Load data (4 words) add ebx, 8 // Update input pointer //Process low word aim_Sum_Words_In_MM1 //Store the result Only in the accumulator movq mm7, mm4 // Update accumulator //Process high word movq mm1, [ebx] // Copy... aim_Sum_Words_In_MM1 add ebx, 8 //Add to the previous data paddw mm7, mm3 // The current word of the accum // translate everything to 2 words on the left movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E] psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G] movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G] psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0] movq [ecx], mm7 // Store the final result movq mm7, mm4 // Update accumulator add ecx, 8 // Update output pointer sub eax, 8 // Update the number of points left // Start the loop row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary movq mm1, [ebx] // Load data aim_Sum_Words_In_MM1 //Add to the previous data //prefetcht0 [ecx + 32] //prefetcht0 [ebx + 48] paddw mm7, mm3 // The current word of the accum psrlw mm7, 2 // divide result by ... // translate everything to 2 words on the left // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C] punpckldq mm0, mm7 // mm0 = [F E D C] movq [ecx], mm0 sub eax, 8 // Update the number of points left movq mm0, mm4 // Update accumulator psrlq mm7, 32 // mm7 =[0 0 H G] //Process high word movq mm1, [ebx+8] // Copy... aim_Sum_Words_In_MM1 //Add to the previous data paddw mm0, mm3 // The current word of the accum psrlw mm0, 2 // divide result by ... // translate everything to 2 words on the left // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C] punpckldq mm7, mm0 // mm7 = [F E D C] add ebx, 16 // Update input pointer movq [ecx+8], mm7 psrlq mm0, 32 // mm0 = [0 0 H G] movq mm7, mm4 // Update accumulator add ecx, 16 // Update output pointer */ jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } // apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im' // result in 'im_out' void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask) { switch (sizeMask) { case 5: avg_Col_5(im,im_out,dataSize,width); break; case 7: avg_Col_7(im,im_out,dataSize,width); break; case 9: avg_Col_9(im,im_out,dataSize,width); break; case 11: avg_Col_11(im,im_out,dataSize,width); break; case 13: avg_Col_13(im,im_out,dataSize,width); break; case 15: avg_Col_15(im,im_out,dataSize,width); break; case 17: avg_Col_17(im,im_out,dataSize,width); break; default: avg_Col_5(im,im_out,dataSize,width); break; } } #define macro_add __asm \ { \ __asm paddusw mm3, [edx] \ __asm paddusw mm2, [edx+8] \ __asm add edx, edi \ } void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-6*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx // 1 movq mm3, [edx] // mm3 = 4 words of im add ebx, 16 movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-8*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 3 psrlw mm2, 3 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-10*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-12*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-14*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi sub ebx, edi // ebx = ebx-16*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add macro_add // divide results by ... psrlw mm3, 4 psrlw mm2, 4 // convert [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // save [mm2 mm3] as 8 bytes packuswb mm3,mm2 movq [ecx], mm3 sub eax, 8 // Update the number of points left add ecx, 8 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width) { __asm { mov edi, width shl edi, 1 // edi = 2*width mov eax, dataSize mov ecx, im_out mov ebx, im sub ebx, edi sub ebx, edi // ebx = ebx-4*width test eax, eax // Is there anything to do?" jz end_sum_loop // Jump out if necessary row_sum_loop: test eax, eax // Is there anything to do? jz end_sum_loop // Jump out if necessary mov edx, ebx add ebx, 16 // 1 movq mm3, [edx] // mm3 = 4 words of im movq mm2, [edx+8] // mm2 = next 4 words of im add edx, edi macro_add macro_add macro_add macro_add // save [mm2 mm3] as words movq [ecx], mm3 movq [ecx+8], mm2 sub eax, 8 // Update the number of points left add ecx, 16 // Update output pointer jmp row_sum_loop // Loop //Cleanup end_sum_loop: emms } } // compare bestScores and secondScores. if second thresh // 0 otherwise pand mm3, mm2 pandn mm2, mm7 por mm3, mm2 movq [edx], mm3 sub eax, 8 // Update the number of points left add ebx, 8 // Update output pointer add ecx, 8 add edx, 8 jmp comp_loop // Loop //Cleanup end_loop: emms } } // windowWidth must be multiple of 8 void cropImage(const uchar* imSrc, int width, int height, uchar* imDest, int x0, int y0, int windowWidth, int windowHeight) { int w8 = windowWidth/8; int step = width-windowWidth; const uchar* srcNewOrigin = imSrc+x0+y0*width; __asm { mov ecx, windowHeight mov edx, w8 mov eax, srcNewOrigin mov ebx, imDest pixel_loop: movq mm1, [eax] movq [ebx], mm1 add eax, 8 add ebx, 8 dec edx jnz pixel_loop mov edx, w8 add eax, step dec ecx jnz pixel_loop jmp done done: emms; } } // return the average pixel value float pixelMean(const uchar* im, int imageSize) { int sum; __asm { mov ecx, imageSize shr ecx, 3 mov eax, im pxor mm7,mm7 // mm7 used as accumulator pxor mm0,mm0 // mm0 = 0 pixel_loop: movq mm1, [eax] movq mm2,mm1 punpcklbw mm2, mm0 punpckhbw mm1, mm0 paddw mm2,mm1 movq mm1,mm2 punpcklwd mm2, mm0 punpckhwd mm1, mm0 paddd mm2,mm1 paddd mm7,mm2 add eax, 8 dec ecx jnz pixel_loop jmp done done: movd ebx, mm7 psrlq mm7, 32 movd edx, mm7 add ebx, edx mov sum, ebx emms } return sum / (float)imageSize; } // ------------------------------------------------------------- // apply mask: // if mask[]=undefined_val im[]->im[] // otherwise, im[]->mask[] // ....... this one may not be exact :-( void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) { __asm { // setup mm0 with 8 copies of 'undefined_val' mov al, undefined_val mov ah, al mov bx, ax shl eax, 16 mov ax, bx movd mm0, eax movd mm1, eax punpckldq mm0, mm1 mov ecx, imageSize shr ecx, 3 mov eax, im mov ebx, mask pixel_loop: movq mm1, [eax] movq mm2, [ebx] movq mm3, mm2 pcmpeqb mm3, mm0 // mm3[] -> xFF if mm2[]==undefined_val // -> x00 otherwise pand mm3, mm1 // mm3[] = mm1[] if mm2[]==undefined_val // = x00 otherwise por mm3, mm2 movq [eax], mm3 add eax, 8 add ebx, 8 dec ecx jnz pixel_loop jmp done done: emms } } void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize) { for (int i=0; i