// www.pudn.com > Estereo.rar > processingSSE2.inl
/***************************************************************************
*
* Copyright 2004 by the Massachusetts Institute of Technology. All
* rights reserved.
*
* Developed by David Demirdjian
* at the Computer Sciences and Artificial Intelligence Laboratory,
* MIT, Cambridge, Massachusetts.
*
* Permission to use, copy, or modify this software and its documentation
* for educational and research purposes only and without fee is hereby
* granted, provided that this copyright notice and the original authors's
* names appear on all copies and supporting documentation. If individual
* files are separated from this distribution directory structure, this
* copyright notice must be included. For any other uses of this software,
* in original or modified form, including but not limited to distribution
* in whole or in part, specific prior permission must be obtained from
* MIT. These programs shall not be used, rewritten, or adapted as the
* basis of a commercial software or hardware product without first
* obtaining appropriate licenses from MIT. MIT. makes no representations
* about the suitability of this software for any purpose. It is provided
* "as is" without express or implied warranty.
*
**************************************************************************/
#include "stereoMatching.h"
#include "processingmmx.h"
// Computes, on unsigned bytes and 16 bytes per iteration,
//     Dest[i] = saturate(|Src1[i] - Src2[i]| + |Src1[i] - Src3[i]|)
//
// Src1, Src2 and Dest must point to 16-byte aligned memory (they are accessed
// with movdqa); Src3 is loaded with movdqu and may be unaligned.
// l is the length in bytes. Only the first (l / 16) * 16 bytes are processed;
// any trailing bytes of Dest are left untouched.
// Returns 0 without touching Dest when l < 16, otherwise 1.
inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2,
                             const unsigned char *Src3, unsigned char *Dest, int l)
{
	// At least one full 16-byte block is required: with a block count of 0
	// the dec/jnz loop below would wrap around and run ~2^32 iterations.
	if (l < 16) return 0;

	__asm
	{
		mov      eax, Src1
		mov      ebx, Src2
		mov      edx, Src3
		mov      edi, Dest
		mov      ecx, l
		shr      ecx, 4              // ecx = number of 16-byte blocks (>= 1)

		align 16
	inner_loop:
		movdqa   xmm1, [eax]         // xmm1 = src1
		movdqa   xmm2, [ebx]         // xmm2 = src2
		movdqa   xmm4, xmm1          // xmm4 = src1
		psubusb  xmm4, xmm2          // xmm4 = sat(src1 - src2)
		movdqu   xmm3, [edx]         // xmm3 = src3 (unaligned load)
		psubusb  xmm2, xmm1          // xmm2 = sat(src2 - src1)
		movdqa   xmm5, xmm1          // xmm5 = src1
		por      xmm2, xmm4          // xmm2 = |src1 - src2| (one side is always 0)
		psubusb  xmm5, xmm3          // xmm5 = sat(src1 - src3)
		psubusb  xmm3, xmm1          // xmm3 = sat(src3 - src1)
		por      xmm3, xmm5          // xmm3 = |src1 - src3|
		paddusb  xmm2, xmm3          // xmm2 = sat(|src1-src2| + |src1-src3|)
		movdqa   [edi], xmm2

		add      eax, 16
		add      ebx, 16
		add      edx, 16
		add      edi, 16
		dec      ecx
		jnz      inner_loop

		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is the same as it has always been for callers.
		emms
	}
	return 1;
}
// Inline-assembly fragment used inside the avg_Col_*_sse2 __asm blocks to
// accumulate one more row into the running sums:
//   xmm3 += 8 words at [edx]      (unsigned saturating add)
//   xmm2 += 8 words at [edx+16]   (unsigned saturating add)
//   edx  += edi                   (step to the next row)
// The memory operands of paddusw must be 16-byte aligned.
// Do not put '//' comments inside the macro body: the line continuations
// would make them swallow the following instructions.
#define macro_add_sse2 __asm \
{ \
__asm paddusw xmm3, [edx] \
__asm paddusw xmm2, [edx+16] \
__asm add edx, edi \
}
// Vertical 5-row box sum, SSE2 version.
// For every group of 16 consecutive 16-bit samples starting at 'im', adds
// (unsigned, saturating) the samples of the 5 rows im-2*width .. im+2*width,
// shifts each sum right by 3 and stores the 16 results, saturated to bytes,
// at 'im_out'.
//
//   im       - centre row of the 16-bit input; all rows are read with aligned
//              16-byte accesses.
//   im_out   - 8-bit output, written with movdqa (must be 16-byte aligned).
//   dataSize - number of output bytes, handled in blocks of 16. A trailing
//              partial block (dataSize % 16) is not processed and a
//              non-positive dataSize does nothing.
//   width    - row pitch of 'im', in elements.
inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov      edi, width
		shl      edi, 1              // edi = row pitch in bytes (2 * width)
		mov      eax, dataSize       // eax = output bytes still to produce
		mov      ecx, im_out         // ecx = output cursor
		mov      ebx, im
		sub      ebx, edi
		sub      ebx, edi            // ebx = im - 2 rows (top row of the window)

	row_sum_loop:
		// Compare against 16 instead of testing for zero: a dataSize that is
		// not a multiple of 16 would otherwise step past zero and never stop.
		cmp      eax, 16
		jl       end_sum_loop

		mov      edx, ebx            // edx walks down the rows of this column block
		add      ebx, 32             // next block: 16 words further right

		// first row initialises the accumulators
		movdqa   xmm3, [edx]         // xmm3 = words 0..7
		movdqa   xmm2, [edx+16]      // xmm2 = words 8..15
		add      edx, edi

		// remaining 4 rows
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums (>> 3)
		psrlw    xmm3, 3
		psrlw    xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3, xmm2
		movdqa   [ecx], xmm3

		sub      eax, 16             // 16 output bytes done
		add      ecx, 16             // advance output cursor
		jmp      row_sum_loop

	end_sum_loop:
		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is unchanged for callers.
		emms
	}
}
// Vertical 7-row box sum, SSE2 version.
// For every group of 16 consecutive 16-bit samples starting at 'im', adds
// (unsigned, saturating) the samples of the 7 rows im-3*width .. im+3*width,
// shifts each sum right by 3 and stores the 16 results, saturated to bytes,
// at 'im_out'.
//
//   im       - centre row of the 16-bit input; all rows are read with aligned
//              16-byte accesses.
//   im_out   - 8-bit output, written with movdqa (must be 16-byte aligned).
//   dataSize - number of output bytes, handled in blocks of 16. A trailing
//              partial block (dataSize % 16) is not processed and a
//              non-positive dataSize does nothing.
//   width    - row pitch of 'im', in elements.
inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov      edi, width
		shl      edi, 1              // edi = row pitch in bytes (2 * width)
		mov      eax, dataSize       // eax = output bytes still to produce
		mov      ecx, im_out         // ecx = output cursor
		mov      ebx, im
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi            // ebx = im - 3 rows (top row of the window)

	row_sum_loop:
		// Compare against 16 instead of testing for zero: a dataSize that is
		// not a multiple of 16 would otherwise step past zero and never stop.
		cmp      eax, 16
		jl       end_sum_loop

		mov      edx, ebx            // edx walks down the rows of this column block
		add      ebx, 32             // next block: 16 words further right

		// first row initialises the accumulators
		movdqa   xmm3, [edx]         // xmm3 = words 0..7
		movdqa   xmm2, [edx+16]      // xmm2 = words 8..15
		add      edx, edi

		// remaining 6 rows
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums (>> 3)
		psrlw    xmm3, 3
		psrlw    xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3, xmm2
		movdqa   [ecx], xmm3

		sub      eax, 16             // 16 output bytes done
		add      ecx, 16             // advance output cursor
		jmp      row_sum_loop

	end_sum_loop:
		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is unchanged for callers.
		emms
	}
}
// Vertical 9-row box sum, SSE2 version.
// For every group of 16 consecutive 16-bit samples starting at 'im', adds
// (unsigned, saturating) the samples of the 9 rows im-4*width .. im+4*width,
// shifts each sum right by 3 and stores the 16 results, saturated to bytes,
// at 'im_out'.
//
//   im       - centre row of the 16-bit input; all rows are read with aligned
//              16-byte accesses.
//   im_out   - 8-bit output, written with movdqa (must be 16-byte aligned).
//   dataSize - number of output bytes, handled in blocks of 16. A trailing
//              partial block (dataSize % 16) is not processed and a
//              non-positive dataSize does nothing.
//   width    - row pitch of 'im', in elements.
inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov      edi, width
		shl      edi, 1              // edi = row pitch in bytes (2 * width)
		mov      eax, dataSize       // eax = output bytes still to produce
		mov      ecx, im_out         // ecx = output cursor
		mov      ebx, im
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi            // ebx = im - 4 rows (top row of the window)

	row_sum_loop:
		// Compare against 16 instead of testing for zero: a dataSize that is
		// not a multiple of 16 would otherwise step past zero and never stop.
		cmp      eax, 16
		jl       end_sum_loop

		mov      edx, ebx            // edx walks down the rows of this column block
		add      ebx, 32             // next block: 16 words further right

		// first row initialises the accumulators
		movdqa   xmm3, [edx]         // xmm3 = words 0..7
		movdqa   xmm2, [edx+16]      // xmm2 = words 8..15
		add      edx, edi

		// remaining 8 rows
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums (>> 3)
		psrlw    xmm3, 3
		psrlw    xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3, xmm2
		movdqa   [ecx], xmm3

		sub      eax, 16             // 16 output bytes done
		add      ecx, 16             // advance output cursor
		jmp      row_sum_loop

	end_sum_loop:
		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is unchanged for callers.
		emms
	}
}
// Vertical 11-row box sum, SSE2 version.
// For every group of 16 consecutive 16-bit samples starting at 'im', adds
// (unsigned, saturating) the samples of the 11 rows im-5*width .. im+5*width,
// shifts each sum right by 3 and stores the 16 results, saturated to bytes,
// at 'im_out'.
//
//   im       - centre row of the 16-bit input; all rows are read with aligned
//              16-byte accesses.
//   im_out   - 8-bit output, written with movdqa (must be 16-byte aligned).
//   dataSize - number of output bytes, handled in blocks of 16. A trailing
//              partial block (dataSize % 16) is not processed and a
//              non-positive dataSize does nothing.
//   width    - row pitch of 'im', in elements.
inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov      edi, width
		shl      edi, 1              // edi = row pitch in bytes (2 * width)
		mov      eax, dataSize       // eax = output bytes still to produce
		mov      ecx, im_out         // ecx = output cursor
		mov      ebx, im
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi            // ebx = im - 5 rows (top row of the window)

	row_sum_loop:
		// Compare against 16 instead of testing for zero: a dataSize that is
		// not a multiple of 16 would otherwise step past zero and never stop.
		cmp      eax, 16
		jl       end_sum_loop

		mov      edx, ebx            // edx walks down the rows of this column block
		add      ebx, 32             // next block: 16 words further right

		// first row initialises the accumulators
		movdqa   xmm3, [edx]         // xmm3 = words 0..7
		movdqa   xmm2, [edx+16]      // xmm2 = words 8..15
		add      edx, edi

		// remaining 10 rows
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums (>> 3)
		psrlw    xmm3, 3
		psrlw    xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3, xmm2
		movdqa   [ecx], xmm3

		sub      eax, 16             // 16 output bytes done
		add      ecx, 16             // advance output cursor
		jmp      row_sum_loop

	end_sum_loop:
		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is unchanged for callers.
		emms
	}
}
// Vertical 13-row box sum, SSE2 version.
// For every group of 16 consecutive 16-bit samples starting at 'im', adds
// (unsigned, saturating) the samples of the 13 rows im-6*width .. im+6*width,
// shifts each sum right by 3 and stores the 16 results, saturated to bytes,
// at 'im_out'.
//
//   im       - centre row of the 16-bit input; all rows are read with aligned
//              16-byte accesses.
//   im_out   - 8-bit output, written with movdqa (must be 16-byte aligned).
//   dataSize - number of output bytes, handled in blocks of 16. A trailing
//              partial block (dataSize % 16) is not processed and a
//              non-positive dataSize does nothing.
//   width    - row pitch of 'im', in elements.
inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
{
	__asm {
		mov      edi, width
		shl      edi, 1              // edi = row pitch in bytes (2 * width)
		mov      eax, dataSize       // eax = output bytes still to produce
		mov      ecx, im_out         // ecx = output cursor
		mov      ebx, im
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi
		sub      ebx, edi            // ebx = im - 6 rows (top row of the window)

	row_sum_loop:
		// Compare against 16 instead of testing for zero: a dataSize that is
		// not a multiple of 16 would otherwise step past zero and never stop.
		cmp      eax, 16
		jl       end_sum_loop

		mov      edx, ebx            // edx walks down the rows of this column block
		add      ebx, 32             // next block: 16 words further right

		// first row initialises the accumulators
		movdqa   xmm3, [edx]         // xmm3 = words 0..7
		movdqa   xmm2, [edx+16]      // xmm2 = words 8..15
		add      edx, edi

		// remaining 12 rows
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2
		macro_add_sse2

		// scale the sums (>> 3)
		psrlw    xmm3, 3
		psrlw    xmm2, 3

		// pack the 16 words of [xmm3 xmm2] into 16 saturated bytes
		packuswb xmm3, xmm2
		movdqa   [ecx], xmm3

		sub      eax, 16             // 16 output bytes done
		add      ecx, 16             // advance output cursor
		jmp      row_sum_loop

	end_sum_loop:
		// No MMX register is used above; emms is kept so the x87/MMX state
		// on exit is unchanged for callers.
		emms
	}
}
// Applies a vertical box mask [1 1 1 ... 1]^T of height 'sizeMask' to 'im'
// and writes the result to 'im_out', by dispatching to the fixed-size kernel
// for that height.
// Heights 5, 7, 9, 11 and 13 use the SSE2 kernels; 15 and 17 are forwarded to
// avg_Col_15 / avg_Col_17 (not defined in this file - presumably the MMX
// versions from processingmmx.h). Any other height falls back to the 5-row
// SSE2 kernel.
// NOTE(review): this mask was originally described as scaled by 1/16, but the
// SSE2 kernels in this file shift their sums right by 3 (1/8) - confirm the
// intended normalisation.
inline void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
{
	switch (sizeMask)
	{
	case 7:
		avg_Col_7_sse2(im, im_out, dataSize, width);
		return;

	case 9:
		avg_Col_9_sse2(im, im_out, dataSize, width);
		return;

	case 11:
		avg_Col_11_sse2(im, im_out, dataSize, width);
		return;

	case 13:
		avg_Col_13_sse2(im, im_out, dataSize, width);
		return;

	case 15:
		avg_Col_15(im, im_out, dataSize, width);
		return;

	case 17:
		avg_Col_17(im, im_out, dataSize, width);
		return;

	case 5:
	default:
		// 5 is both an explicit choice and the fallback for unsupported sizes
		avg_Col_5_sse2(im, im_out, dataSize, width);
		return;
	}
}