// www.pudn.com > OpenCV-Intel.zip > cvconvolve.cpp


/*M/////////////////////////////////////////////////////////////////////////////////////// 
// 
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
// 
//  By downloading, copying, installing or using the software you agree to this license. 
//  If you do not agree to this license, do not download, install, 
//  copy or use the software. 
// 
// 
//                        Intel License Agreement 
//                For Open Source Computer Vision Library 
// 
// Copyright (C) 2000, Intel Corporation, all rights reserved. 
// Third party copyrights are property of their respective owners. 
// 
// Redistribution and use in source and binary forms, with or without modification, 
// are permitted provided that the following conditions are met: 
// 
//   * Redistribution's of source code must retain the above copyright notice, 
//     this list of conditions and the following disclaimer. 
// 
//   * Redistribution's in binary form must reproduce the above copyright notice, 
//     this list of conditions and the following disclaimer in the documentation 
//     and/or other materials provided with the distribution. 
// 
//   * The name of Intel Corporation may not be used to endorse or promote products 
//     derived from this software without specific prior written permission. 
// 
// This software is provided by the copyright holders and contributors "as is" and 
// any express or implied warranties, including, but not limited to, the implied 
// warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct, 
// indirect, incidental, special, exemplary, or consequential damages 
// (including, but not limited to, procurement of substitute goods or services; 
// loss of use, data, or profits; or business interruption) however caused 
// and on any theory of liability, whether in contract, strict liability, 
// or tort (including negligence or otherwise) arising in any way out of 
// the use of this software, even if advised of the possibility of such damage. 
// 
//M*/ 
 
#include "_cv.h" 
#include  
 
/////////////////////// common functions for working with IPP filters //////////////////// 
 
/* Creates the temporary stripe matrix used together with icvIPPFilterNextStripe().

   src         - image that is going to be filtered; supplies element type and size
   stripe_size - requested buffer size; divided (with rounding) by the byte
                 length of one buffer row to obtain the number of rows
   ksize       - filter aperture size

   The buffer is (src->cols + ksize.width - 1) columns wide, rounded up by
   cvAlign to a multiple of 8/(bytes per channel element). Its height is kept
   within [ksize.height, src->rows + ksize.height - 1]; the upper bound is
   applied last. Returns whatever cvCreateMat() returns. */
CvMat* icvIPPFilterInit( const CvMat* src, int stripe_size, CvSize ksize )
{
    int elem_bytes = CV_ELEM_SIZE(src->type);
    int depth_bytes = CV_ELEM_SIZE(src->type & CV_MAT_DEPTH_MASK);
    int buf_cols = cvAlign( src->cols + ksize.width - 1, 8/depth_bytes );
    int row_bytes = buf_cols*elem_bytes;
    int max_rows = src->rows + ksize.height - 1;

    /* stripe_size / row_bytes, rounded to the nearest integer */
    int buf_rows = (stripe_size*2 + row_bytes) / (row_bytes*2);

    if( buf_rows < ksize.height )
        buf_rows = ksize.height;
    if( buf_rows > max_rows )
        buf_rows = max_rows;

    return cvCreateMat( buf_rows, buf_cols, src->type );
}
 
 
int icvIPPFilterNextStripe( const CvMat* src, CvMat* temp, int y, 
                            CvSize ksize, CvPoint anchor ) 
{ 
    int pix_size = CV_ELEM_SIZE(src->type); 
    int src_step = src->step ? src->step : CV_STUB_STEP; 
    int temp_step = temp->step ? temp->step : CV_STUB_STEP; 
    int i, dy, src_y1 = 0, src_y2; 
    int temp_rows; 
    uchar* temp_ptr = temp->data.ptr; 
    CvSize stripe_size, temp_size; 
    CvCopyNonConstBorderFunc copy_border_func = 
        icvGetCopyNonConstBorderFunc( pix_size, IPL_BORDER_REPLICATE ); 
 
    dy = MIN( temp->rows - ksize.height + 1, src->rows - y ); 
    if( y > 0 ) 
    { 
        int temp_ready = ksize.height - 1; 
         
        for( i = 0; i < temp_ready; i++ ) 
            memcpy( temp_ptr + temp_step*i, temp_ptr + 
                    temp_step*(temp->rows - temp_ready + i), temp_step ); 
 
        temp_ptr += temp_ready*temp_step; 
        temp_rows = dy; 
        src_y1 = y + temp_ready - anchor.y; 
        src_y2 = src_y1 + dy; 
        if( src_y1 >= src->rows ) 
        { 
            src_y1 = src->rows - 1; 
            src_y2 = src->rows; 
        } 
    } 
    else 
    { 
        temp_rows = dy + ksize.height - 1; 
        src_y2 = temp_rows - anchor.y; 
    } 
 
    src_y2 = MIN( src_y2, src->rows ); 
 
    stripe_size = cvSize(src->cols, src_y2 - src_y1); 
    temp_size = cvSize(temp->cols, temp_rows); 
    copy_border_func( src->data.ptr + src_y1*src_step, src_step, 
                      stripe_size, temp_ptr, temp_step, temp_size, 
                      (y == 0 ? anchor.y : 0), anchor.x ); 
    return dy; 
} 
 
 
/////////////////////////////// IPP separable filter functions /////////////////////////// 
 
// Entry points of the IPP row (horizontal) filters, one per element
// depth / channel count handled by icvIPPSepFilter(). All start out null;
// icvIPPSepFilter() checks icvFilterRow_8u_C1R_p and exits without filtering
// while it is null.
// NOTE(review): presumably assigned when the IPP library is loaded at run
// time -- the assignment is not visible in this part of the file.
icvFilterRow_8u_C1R_t icvFilterRow_8u_C1R_p = 0; 
icvFilterRow_8u_C3R_t icvFilterRow_8u_C3R_p = 0; 
icvFilterRow_8u_C4R_t icvFilterRow_8u_C4R_p = 0; 
icvFilterRow_16s_C1R_t icvFilterRow_16s_C1R_p = 0; 
icvFilterRow_16s_C3R_t icvFilterRow_16s_C3R_p = 0; 
icvFilterRow_16s_C4R_t icvFilterRow_16s_C4R_p = 0; 
icvFilterRow_32f_C1R_t icvFilterRow_32f_C1R_p = 0; 
icvFilterRow_32f_C3R_t icvFilterRow_32f_C3R_p = 0; 
icvFilterRow_32f_C4R_t icvFilterRow_32f_C4R_p = 0; 
 
// Entry points of the matching IPP column (vertical) filters; also null
// by default.
icvFilterColumn_8u_C1R_t icvFilterColumn_8u_C1R_p = 0; 
icvFilterColumn_8u_C3R_t icvFilterColumn_8u_C3R_p = 0; 
icvFilterColumn_8u_C4R_t icvFilterColumn_8u_C4R_p = 0; 
icvFilterColumn_16s_C1R_t icvFilterColumn_16s_C1R_p = 0; 
icvFilterColumn_16s_C3R_t icvFilterColumn_16s_C3R_p = 0; 
icvFilterColumn_16s_C4R_t icvFilterColumn_16s_C4R_p = 0; 
icvFilterColumn_32f_C1R_t icvFilterColumn_32f_C1R_p = 0; 
icvFilterColumn_32f_C3R_t icvFilterColumn_32f_C3R_p = 0; 
icvFilterColumn_32f_C4R_t icvFilterColumn_32f_C4R_p = 0; 
 
////////////////////////////////////////////////////////////////////////////////////////// 
 
// Common signature through which icvIPPSepFilter() calls both the row and the
// column filter: source pointer and step, destination pointer and step, size
// of the processed region, kernel coefficients, kernel length and the anchor
// index inside the kernel.
typedef CvStatus (CV_STDCALL * CvIPPSepFilterFunc) 
    ( const void* src, int srcstep, void* dst, int dststep, 
      CvSize size, const float* kernel, int ksize, int anchor ); 
 
/* Separable filtering of `src` into `dst` through the IPP row/column filter
   entry points, processed stripe by stripe with replicated borders.

   src, dst - source and destination matrices; must have equal type and size.
              They may share the same data pointer (in-place operation).
   kernelX  - horizontal kernel: continuous CV_32FC1 matrix with a single
              row or a single column
   kernelY  - vertical kernel, same requirements as kernelX
   anchor   - anchor position; anchor.x / anchor.y must lie inside
              kernelX / kernelY respectively

   Returns 1 when the image has been filtered. `result` stays 0 on every early
   exit: icvFilterRow_8u_C1R_p is null, the element type is not one of
   8u/16s/32f with 1, 3 or 4 channels, or an error was raised.

   NOTE(review): __BEGIN__/__END__/EXIT/CV_CALL/CV_ERROR/IPPI_CALL are OpenCV
   error-handling macros defined elsewhere; presumably EXIT and a raised error
   jump to the cleanup code that follows __END__. */
int icvIPPSepFilter( const CvMat* src, CvMat* dst, const CvMat* kernelX, 
                     const CvMat* kernelY, CvPoint anchor ) 
{ 
    int result = 0; 
     
    CvMat* top_bottom = 0;  // scratch rows for stripes that touch the top/bottom border
    CvMat* vout_hin = 0;    // vertical pass output == horizontal pass input
    CvMat* dst_buf = 0;     // delayed output buffer, allocated for in-place operation only
     
    CV_FUNCNAME( "icvIPPSepFilter" ); 
 
    __BEGIN__; 
 
    CvSize ksize; 
    CvPoint el_anchor; 
    CvSize size; 
    int type, depth, pix_size; 
    int i, x, y, dy = 0, prev_dy = 0, max_dy; 
    CvMat vout; 
    CvCopyNonConstBorderFunc copy_border_func; 
    CvIPPSepFilterFunc x_func = 0, y_func = 0; 
    int src_step, top_bottom_step; 
    float *kx, *ky; 
    int align, stripe_size; 
 
    // the 8u single-channel row filter pointer doubles as the "IPP is available" flag
    if( !icvFilterRow_8u_C1R_p ) 
        EXIT; 
 
    // argument validation; note that && binds tighter than ||, so each
    // "cols != 1 && rows != 1" pair is one "kernel is not a vector" test
    if( !CV_ARE_TYPES_EQ( src, dst ) || !CV_ARE_SIZES_EQ( src, dst ) || 
        !CV_IS_MAT_CONT(kernelX->type & kernelY->type) || 
        CV_MAT_TYPE(kernelX->type) != CV_32FC1 || 
        CV_MAT_TYPE(kernelY->type) != CV_32FC1 || 
        kernelX->cols != 1 && kernelX->rows != 1 || 
        kernelY->cols != 1 && kernelY->rows != 1 || 
        (unsigned)anchor.x >= (unsigned)(kernelX->cols + kernelX->rows - 1) || 
        (unsigned)anchor.y >= (unsigned)(kernelY->cols + kernelY->rows - 1) ) 
        CV_ERROR( CV_StsError, "Internal Error: incorrect parameters" ); 
 
    // one of cols/rows is 1, so cols + rows - 1 is the kernel length
    ksize.width = kernelX->cols + kernelX->rows - 1; 
    ksize.height = kernelY->cols + kernelY->rows - 1; 
 
    /*if( ksize.width <= 5 && ksize.height <= 5 ) 
    { 
        float* ker = (float*)cvStackAlloc( ksize.width*ksize.height*sizeof(ker[0])); 
        CvMat kernel = cvMat( ksize.height, ksize.width, CV_32F, ker ); 
        for( y = 0, i = 0; y < ksize.height; y++ ) 
            for( x = 0; x < ksize.width; x++, i++ ) 
                ker[i] = kernelY->data.fl[y]*kernelX->data.fl[x]; 
 
        CV_CALL( cvFilter2D( src, dst, &kernel, anchor )); 
        EXIT; 
    }*/ 
 
    type = CV_MAT_TYPE(src->type); 
    depth = CV_MAT_DEPTH(type); 
    pix_size = CV_ELEM_SIZE(type); 
 
    // pick the row/column filter pair that matches the element type
    if( type == CV_8UC1 ) 
        x_func = icvFilterRow_8u_C1R_p, y_func = icvFilterColumn_8u_C1R_p; 
    else if( type == CV_8UC3 ) 
        x_func = icvFilterRow_8u_C3R_p, y_func = icvFilterColumn_8u_C3R_p; 
    else if( type == CV_8UC4 ) 
        x_func = icvFilterRow_8u_C4R_p, y_func = icvFilterColumn_8u_C4R_p; 
    else if( type == CV_16SC1 ) 
        x_func = icvFilterRow_16s_C1R_p, y_func = icvFilterColumn_16s_C1R_p; 
    else if( type == CV_16SC3 ) 
        x_func = icvFilterRow_16s_C3R_p, y_func = icvFilterColumn_16s_C3R_p; 
    else if( type == CV_16SC4 ) 
        x_func = icvFilterRow_16s_C4R_p, y_func = icvFilterColumn_16s_C4R_p; 
    else if( type == CV_32FC1 ) 
        x_func = icvFilterRow_32f_C1R_p, y_func = icvFilterColumn_32f_C1R_p; 
    else if( type == CV_32FC3 ) 
        x_func = icvFilterRow_32f_C3R_p, y_func = icvFilterColumn_32f_C3R_p; 
    else if( type == CV_32FC4 ) 
        x_func = icvFilterRow_32f_C4R_p, y_func = icvFilterColumn_32f_C4R_p; 
    else 
        EXIT; 
 
    size = cvGetMatSize(src); 
    // a smaller stripe budget is used for in-place operation
    stripe_size = src->data.ptr == dst->data.ptr ? 1 << 15 : 1 << 16; 
    // maximum number of rows per stripe: at least ksize.height - 1,
    // at most size.height + ksize.height - 1
    max_dy = MAX( ksize.height - 1, stripe_size/(size.width + ksize.width - 1)); 
    max_dy = MIN( max_dy, size.height + ksize.height - 1 ); 
     
    // buffer widths are rounded up to a multiple of 8/(bytes per channel element)
    align = 8/CV_ELEM_SIZE(depth); 
 
    CV_CALL( top_bottom = cvCreateMat( ksize.height*2, cvAlign(size.width,align), type )); 
 
    CV_CALL( vout_hin = cvCreateMat( max_dy + ksize.height, 
        cvAlign(size.width + ksize.width - 1, align), type )); 
     
    if( src->data.ptr == dst->data.ptr && size.height ) 
        CV_CALL( dst_buf = cvCreateMat( max_dy + ksize.height, 
            cvAlign(size.width, align), type )); 
 
    kx = (float*)cvStackAlloc( ksize.width*sizeof(kx[0]) ); 
    ky = (float*)cvStackAlloc( ksize.height*sizeof(ky[0]) ); 
 
    // mirror the kernels 
    for( i = 0; i < ksize.width; i++ ) 
        kx[i] = kernelX->data.fl[ksize.width - i - 1]; 
 
    for( i = 0; i < ksize.height; i++ ) 
        ky[i] = kernelY->data.fl[ksize.height - i - 1]; 
 
    // the anchor is mirrored together with the kernels
    el_anchor = cvPoint( ksize.width - anchor.x - 1, ksize.height - anchor.y - 1 ); 
 
    // vout views the size.width columns of vout_hin that start at column
    // anchor.x; the columns on either side of it hold the left/right borders
    cvGetCols( vout_hin, &vout, anchor.x, anchor.x + size.width ); 
    copy_border_func = icvGetCopyNonConstBorderFunc( pix_size, IPL_BORDER_REPLICATE ); 
 
    src_step = src->step ? src->step : CV_STUB_STEP; 
    top_bottom_step = top_bottom->step ? top_bottom->step : CV_STUB_STEP; 
    vout.step = vout.step ? vout.step : CV_STUB_STEP; 
 
    // process the image in horizontal stripes of dy rows each
    for( y = 0; y < size.height; y += dy ) 
    { 
        const CvMat *vin = src, *hout = dst; 
        int src_y = y, dst_y = y; 
        // rows that can be filtered straight from src: the
        // ksize.height - anchor.y - 1 rows below the stripe must still exist
        dy = MIN( max_dy, size.height - (ksize.height - anchor.y - 1) - y ); 
 
        // NOTE(review): with anchor.y == 0 and ksize.height > 1, dy becomes 0
        // once y reaches size.height - (ksize.height - 1). The test below is
        // then false (0 < 0), a zero-height stripe is handed to y_func and y
        // does not advance unless that call reports an error. Confirm callers
        // never pass anchor.y == 0 together with a multi-row kernel.
        if( y < anchor.y || dy < anchor.y ) 
        { 
            // stripe touches the top or bottom edge: build its vertical
            // neighbourhood, with replicated rows, in top_bottom
            int ay = anchor.y; 
            CvSize src_stripe_size = size; 
             
            if( y < anchor.y ) 
            { 
                // top of the image
                src_y = 0; 
                dy = MIN( anchor.y, size.height ); 
                src_stripe_size.height = MIN( dy + ksize.height - anchor.y - 1, size.height ); 
            } 
            else 
            { 
                // bottom of the image: all the remaining rows go into this stripe
                src_y = MAX( y - anchor.y, 0 ); 
                dy = size.height - y; 
                src_stripe_size.height = MIN( dy + anchor.y, size.height ); 
                ay = MAX( anchor.y - y, 0 ); 
            } 
 
            copy_border_func( src->data.ptr + src_y*src_step, src_step, src_stripe_size, 
                              top_bottom->data.ptr, top_bottom_step, 
                              cvSize(size.width, dy + ksize.height - 1), 
                              ay, 0 ); 
            vin = top_bottom; 
            // the first row to filter sits anchor.y rows into top_bottom
            src_y = anchor.y;             
        } 
 
        // do vertical convolution 
        IPPI_CALL( y_func( vin->data.ptr + src_y*vin->step, vin->step ? vin->step : CV_STUB_STEP, 
                           vout.data.ptr, vout.step, cvSize(size.width, dy), 
                           ky, ksize.height, el_anchor.y )); 
 
        // now it's time to copy the previously processed stripe to the input/output image 
        if( src->data.ptr == dst->data.ptr ) 
        { 
            for( i = 0; i < prev_dy; i++ ) 
                memcpy( dst->data.ptr + (y - prev_dy + i)*dst->step, 
                        dst_buf->data.ptr + i*dst_buf->step, size.width*pix_size ); 
            // every stripe except the last is written to dst_buf first;
            // the last one goes directly into dst
            if( y + dy < size.height ) 
            { 
                hout = dst_buf; 
                dst_y = 0; 
            } 
        } 
 
        // create a border for every line by replicating the left-most/right-most elements 
        // (x is a byte offset relative to the first pixel of the row)
        for( i = 0; i < dy; i++ ) 
        { 
            uchar* ptr = vout.data.ptr + i*vout.step; 
            for( x = -1; x >= -anchor.x*pix_size; x-- ) 
                ptr[x] = ptr[x + pix_size]; 
            for( x = size.width*pix_size; x < (size.width+ksize.width-anchor.x-1)*pix_size; x++ ) 
                ptr[x] = ptr[x - pix_size]; 
        } 
 
        // do horizontal convolution 
        IPPI_CALL( x_func( vout.data.ptr, vout.step, hout->data.ptr + dst_y*hout->step, 
                           hout->step ? hout->step : CV_STUB_STEP, 
                           cvSize(size.width, dy), kx, ksize.width, el_anchor.x )); 
        prev_dy = dy; 
    } 
 
    result = 1; 
 
    __END__; 
 
    // cleanup; the pointers that were never allocated are still 0 here
    cvReleaseMat( &vout_hin ); 
    cvReleaseMat( &dst_buf ); 
    cvReleaseMat( &top_bottom ); 
 
    return result; 
} 
 
 
#define ICV_DEF_FILTER_FUNC( flavor, arrtype, worktype,                     \ 
                             load_macro, cast_macro1, cast_macro2 )         \ 
static CvStatus CV_STDCALL                                                  \ 
icvFilter_##flavor##_CnR( arrtype* src, int srcstep,                        \ 
                          arrtype* dst, int dststep, CvSize* roi,           \ 
                          CvFilterState* state, int stage )                 \ 
{                                                                           \ 
    int width = roi->width;                                                 \ 
    int src_height = roi->height;                                           \ 
    int dst_height = src_height;                                            \ 
    int x, y = 0, i;                                                        \ 
                                                                            \ 
    int ker_x = state->ker_x;                                               \ 
    int ker_y = state->ker_y;                                               \ 
    int ker_width = state->ker_width;                                       \ 
    int ker_height = state->ker_height;                                     \ 
    const float *ker_data = (const float*)state->ker0;                      \ 
                                                                            \ 
    int crows = state->crows;                                               \ 
    arrtype **rows = (arrtype**) (state->rows);                             \ 
    arrtype* tbuf = (arrtype*)(state->tbuf);                                \ 
                                                                            \ 
    int channels = state->channels;                                         \ 
    int ker_x_n = ker_x * channels;                                         \ 
    int ker_width_n = ker_width * channels;                                 \ 
    int width_n = width * channels;                                         \ 
                                                                            \ 
    int starting_flag = 0;                                                  \ 
    int width_rest = width_n & (CV_MORPH_ALIGN - 1);                        \ 
    arrtype **ker_ptr, **ker = (arrtype**)cvStackAlloc(                     \ 
                ker_width*ker_height*sizeof(ker[0]) );                      \ 
    float* ker_coeffs0 = (float*)cvStackAlloc(                              \ 
                ker_width*ker_height*sizeof(ker_coeffs0[0]) );              \ 
    float* ker_coeffs = ker_coeffs0;                                        \ 
                                                                            \ 
    srcstep /= sizeof(src[0]);                                              \ 
    dststep /= sizeof(dst[0]);                                              \ 
                                                                            \ 
    for( i = 0; i < ker_height; i++ )                                       \ 
        for( x = 0; x < ker_width; x++ )                                    \ 
        {                                                                   \ 
            int t = ((int*)ker_data)[i*ker_width + x];                      \ 
            if( t )                                                         \ 
            {                                                               \ 
                *(int*)ker_coeffs = t;                                      \ 
                ker_coeffs++;                                               \ 
            }                                                               \ 
        }                                                                   \ 
    if( stage == CV_START + CV_END )                                        \ 
        stage = CV_WHOLE;                                                   \ 
                                                                            \ 
    /* initialize cyclic buffer when starting */                            \ 
    if( stage == CV_WHOLE || stage == CV_START )                            \ 
    {                                                                       \ 
        for( i = 0; i < ker_height; i++ )                                   \ 
        {                                                                   \ 
            rows[i] = (arrtype*)(state->buffer + state->buffer_step * i);   \ 
        }                                                                   \ 
        crows = ker_y;                                                      \ 
        if( stage != CV_WHOLE )                                             \ 
            dst_height -= ker_height - ker_y - 1;                           \ 
        starting_flag = 1;                                                  \ 
    }                                                                       \ 
                                                                            \ 
    if( stage == CV_END )                                                   \ 
        dst_height += ker_height - ker_y - 1;                               \ 
                                                                            \ 
    do                                                                      \ 
    {                                                                       \ 
        arrtype *tsrc, *tdst;                                               \ 
        int need_copy = 0;                                                  \ 
                                                                            \ 
        /* fill cyclic buffer */                                            \ 
        for( ; crows < ker_height; crows++ )                                \ 
        {                                                                   \ 
            tsrc = src;                                                     \ 
            tdst = rows[crows];                                             \ 
                                                                            \ 
            if( src_height-- <= 0 )                                         \ 
            {                                                               \ 
                if( stage != CV_WHOLE && stage != CV_END )                  \ 
                    break;                                                  \ 
                /* duplicate last row */                                    \ 
                tsrc = rows[crows - 1];                                     \ 
                CV_COPY( tdst, tsrc, width_n + ker_width_n, x );            \ 
                continue;                                                   \ 
            }                                                               \ 
                                                                            \ 
            src += srcstep;                                                 \ 
                                                                            \ 
            CV_COPY( tdst + ker_x_n, tsrc, width_n, x );                    \ 
                                                                            \ 
            /* make replication borders */                                  \ 
            for( i = ker_x_n - 1; i >= 0; i-- )                             \ 
                tdst[i] = tdst[i + channels];                               \ 
            for( i = width_n + ker_x_n; i < width_n + ker_width_n; i++ )    \ 
                tdst[i] = tdst[i - channels];                               \ 
        }                                                                   \ 
                                                                            \ 
        if( starting_flag )                                                 \ 
        {                                                                   \ 
            starting_flag = 0;                                              \ 
            tsrc = rows[ker_y];                                             \ 
                                                                            \ 
            for( i = 0; i < ker_y; i++ )                                    \ 
            {                                                               \ 
                tdst = rows[i];                                             \ 
                CV_COPY( tdst, tsrc, width_n + ker_width_n, x );            \ 
            }                                                               \ 
        }                                                                   \ 
                                                                            \ 
        /* do convolution */                                                \ 
        if( crows < ker_height )                                            \ 
            break;                                                          \ 
                                                                            \ 
        tdst = dst;                                                         \ 
        if( width_rest )                                                    \ 
        {                                                                   \ 
            need_copy = width_n < CV_MORPH_ALIGN || y == dst_height - 1;    \ 
                                                                            \ 
            if( need_copy )                                                 \ 
                tdst = tbuf;                                                \ 
            else                                                            \ 
                CV_COPY( tbuf + width_n, dst + width_n, CV_MORPH_ALIGN, x );\ 
        }                                                                   \ 
                                                                            \ 
        ker_ptr = ker;                                                      \ 
        for( i = 0; i < ker_height; i++ )                                   \ 
            for( x = 0; x < ker_width; x++ )                                \ 
                if( ((int*)ker_data)[i*ker_width + x] )                     \ 
                    *ker_ptr++ = rows[i] + x*channels;                      \ 
                                                                            \ 
        if( channels == 3 )                                                 \ 
        {                                                                   \ 
            for( x = 0; x < width_n; x += 3 )                               \ 
            {                                                               \ 
                double sum0 = 0, sum1 = 0, sum2 = 0;                        \ 
                worktype t0, t1, t2;                                        \ 
                arrtype** kp = ker;                                         \ 
                ker_coeffs = ker_coeffs0;                                   \ 
                while( kp != ker_ptr )                                      \ 
                {                                                           \ 
                    arrtype* tp = *kp++;                                    \ 
                    double f = *ker_coeffs++;                               \ 
                    sum0 += load_macro(tp[x])*f;                            \ 
                    sum1 += load_macro(tp[x+1])*f;                          \ 
                    sum2 += load_macro(tp[x+2])*f;                          \ 
                }                                                           \ 
                t0 = cast_macro1(sum0);                                     \ 
                t1 = cast_macro1(sum1);                                     \ 
                t2 = cast_macro1(sum2);                                     \ 
                tdst[x] = cast_macro2(t0);                                  \ 
                tdst[x+1] = cast_macro2(t1);                                \ 
                tdst[x+2] = cast_macro2(t2);                                \ 
            }                                                               \ 
        }                                                                   \ 
        else                                                                \ 
        {                                                                   \ 
            for( x = 0; x < width_n; x += 4 )                               \ 
            {                                                               \ 
                double sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;              \ 
                worktype t0, t1;                                            \ 
                arrtype** kp = ker;                                         \ 
                ker_coeffs = ker_coeffs0;                                   \ 
                while( kp != ker_ptr )                                      \ 
                {                                                           \ 
                    arrtype* tp = *kp++;                                    \ 
                    double f = *ker_coeffs++;                               \ 
                    sum0 += load_macro(tp[x])*f;                            \ 
                    sum1 += load_macro(tp[x+1])*f;                          \ 
                    sum2 += load_macro(tp[x+2])*f;                          \ 
                    sum3 += load_macro(tp[x+3])*f;                          \ 
                }                                                           \ 
                t0 = cast_macro1(sum0);                                     \ 
                t1 = cast_macro1(sum1);                                     \ 
                tdst[x] = cast_macro2(t0);                                  \ 
                tdst[x+1] = cast_macro2(t1);                                \ 
                t0 = cast_macro1(sum2);                                     \ 
                t1 = cast_macro1(sum3);                                     \ 
                tdst[x+2] = cast_macro2(t0);                                \ 
                tdst[x+3] = cast_macro2(t1);                                \ 
            }                                                               \ 
        }                                                                   \ 
                                                                            \ 
        if( width_rest )                                                    \ 
        {                                                                   \ 
            if( need_copy )                                                 \ 
                CV_COPY( dst, tbuf, width_n, x );                           \ 
            else                                                            \ 
                CV_COPY( dst + width_n, tbuf + width_n, CV_MORPH_ALIGN, x );\ 
        }                                                                   \ 
                                                                            \ 
        /* rotate buffer */                                                 \ 
        {                                                                   \ 
            arrtype *t = rows[0];                                           \ 
                                                                            \ 
            CV_COPY( rows, rows + 1, ker_height - 1, i );                   \ 
            rows[i] = t;                                                    \ 
            crows--;                                                        \ 
            dst += dststep;                                                 \ 
        }                                                                   \ 
    }                                                                       \ 
    while( ++y < dst_height );                                              \ 
                                                                            \ 
    roi->height = y;                                                        \ 
    state->crows = crows;                                                   \ 
                                                                            \ 
    return CV_OK;                                                           \ 
} 
 
 
// Instantiate the generic filter for the three element types registered in
// icvInitFilterTab(): icvFilter_8u_CnR, icvFilter_16u_CnR and icvFilter_32f_CnR.
// The 8u and 16u flavors use cvRound on the accumulated sum; 32f uses CV_NOP.
ICV_DEF_FILTER_FUNC( 8u, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U ) 
ICV_DEF_FILTER_FUNC( 16u, ushort, int, CV_NOP, cvRound, CV_CAST_16U ) 
ICV_DEF_FILTER_FUNC( 32f, float, double, CV_NOP, CV_NOP, CV_CAST_32F ) 
 
 
/* Registers the generic (non-IPP) 2D filter implementations in `tab`,
   indexed by element depth. Only the CV_8U, CV_16U and CV_32F entries are
   written; all other entries are left as they were. */
static void icvInitFilterTab( CvFuncTable* tab )
{
    void** by_depth = tab->fn_2d;

    by_depth[CV_32F] = (void*)icvFilter_32f_CnR;
    by_depth[CV_16U] = (void*)icvFilter_16u_CnR;
    by_depth[CV_8U] = (void*)icvFilter_8u_CnR;
}
 
 
//////////////////////////////// IPP generic filter functions //////////////////////////// 
 
// Entry points of the IPP general (non-separable) 2D filters, one per
// element depth / channel count. All start out null.
// NOTE(review): presumably assigned when the IPP library is loaded at run
// time -- the assignment is not visible in this part of the file.
icvFilter_8u_C1R_t icvFilter_8u_C1R_p = 0; 
icvFilter_8u_C3R_t icvFilter_8u_C3R_p = 0; 
icvFilter_8u_C4R_t icvFilter_8u_C4R_p = 0; 
icvFilter_16s_C1R_t icvFilter_16s_C1R_p = 0; 
icvFilter_16s_C3R_t icvFilter_16s_C3R_p = 0; 
icvFilter_16s_C4R_t icvFilter_16s_C4R_p = 0; 
icvFilter_32f_C1R_t icvFilter_32f_C1R_p = 0; 
icvFilter_32f_C3R_t icvFilter_32f_C3R_p = 0; 
icvFilter_32f_C4R_t icvFilter_32f_C4R_p = 0; 
 
////////////////////////////////////////////////////////////////////////////////////////// 
 
/* Common signature through which cvFilter2D calls whichever IPP filter
   pointer it selected: source/destination pointers with their steps, the size
   of the region to process, a float kernel with its size, and the kernel
   anchor. Returns a CvStatus code. */
typedef CvStatus (CV_STDCALL * CvFilterIPPFunc) 
( const void* src, int srcstep, void* dst, int dststep, CvSize size, 
  const float* kernel, CvSize ksize, CvPoint anchor ); 
 
/*
   Filters _src with a user-supplied kernel and stores the result in _dst.

   _src, _dst - input and output arrays; they must have the same size and the
                same type, and must not have a channel of interest selected.
   _kernel    - single-channel CV_32F or CV_64F CvMat holding the filter
                coefficients. It is never modified: whenever it has to be
                converted, compacted or mirrored a private float copy is used.
   anchor     - position of the filtered pixel inside the kernel;
                (-1,-1) selects the kernel center (cols/2, rows/2).

   Errors are reported through CV_ERROR:
     CV_BadCOI               - a COI is set on _src or _dst
     CV_StsUnmatchedSizes    - _src and _dst differ in size
     CV_StsUnmatchedFormats  - _src and _dst differ in type
     CV_StsBadArg            - _kernel is not a 32F/64F single-channel CvMat
     CV_StsOutOfRange        - anchor lies outside of the kernel
     CV_BadNumChannels       - 2-channel input (generic path)
     CV_StsUnsupportedFormat - no generic filter is registered for the depth

   When the IPP filter pointers are set and the image has at least
   ipp_lower_limit rows or columns, the matching IPP function is used;
   otherwise the generic implementation from the depth dispatch table runs.
*/
CV_IMPL void
cvFilter2D( const CvArr* _src, CvArr* _dst, const CvMat* _kernel, CvPoint anchor )
{
    // below that approximate size OpenCV is faster
    const int ipp_lower_limit = 20;

    static CvFuncTable filter_tab;
    static int inittab = 0;
    // resources released in the common cleanup section after __END__
    CvFilterState *state = 0;
    float* kernel_data = 0;
    int local_alloc = 1;    // 1: kernel_data is absent or stack-allocated (nothing to free)
    CvMat* temp = 0;

    CV_FUNCNAME( "cvFilter2D" );

    __BEGIN__;

    CvFilterFunc func = 0;
    int coi1 = 0, coi2 = 0;
    CvMat srcstub, *src = (CvMat*)_src;
    CvMat dststub, *dst = (CvMat*)_dst;
    CvSize size;
    int type, depth;
    int src_step, dst_step;
    CvMat kernel_hdr;
    const CvMat* kernel = _kernel;

    if( !inittab )
    {
        icvInitFilterTab( &filter_tab );
        inittab = 1;
    }

    CV_CALL( src = cvGetMat( src, &srcstub, &coi1 ));
    CV_CALL( dst = cvGetMat( dst, &dststub, &coi2 ));

    if( coi1 != 0 || coi2 != 0 )
        CV_ERROR( CV_BadCOI, "" );

    type = CV_MAT_TYPE( src->type );

    if( !CV_ARE_SIZES_EQ( src, dst ))
        CV_ERROR( CV_StsUnmatchedSizes, "" );

    if( !CV_ARE_TYPES_EQ( src, dst ))
        CV_ERROR( CV_StsUnmatchedFormats, "" );

    if( !CV_IS_MAT(kernel) ||
        (CV_MAT_TYPE(kernel->type) != CV_32F &&
        CV_MAT_TYPE(kernel->type) != CV_64F ))
        CV_ERROR( CV_StsBadArg, "kernel must be single-channel floating-point matrix" );

    if( anchor.x == -1 && anchor.y == -1 )
        anchor = cvPoint(kernel->cols/2,kernel->rows/2);

    // the unsigned comparison rejects negative coordinates as well
    if( (unsigned)anchor.x >= (unsigned)kernel->cols ||
        (unsigned)anchor.y >= (unsigned)kernel->rows )
        CV_ERROR( CV_StsOutOfRange, "anchor point is out of kernel" );

    // Work on a private continuous float copy of the kernel when the caller's
    // matrix is not already continuous CV_32FC1, and also whenever the IPP
    // pointer is set, since the IPP branch below mirrors the kernel data in place.
    if( CV_MAT_TYPE(kernel->type) != CV_32FC1 || !CV_IS_MAT_CONT(kernel->type) || icvFilter_8u_C1R_p )
    {
        int sz = kernel->rows*kernel->cols*sizeof(kernel_data[0]);
        if( sz < CV_MAX_LOCAL_SIZE )
            kernel_data = (float*)cvStackAlloc( sz );
        else
        {
            CV_CALL( kernel_data = (float*)cvAlloc( sz ));
            local_alloc = 0;
        }
        kernel_hdr = cvMat( kernel->rows, kernel->cols, CV_32F, kernel_data );
        // check the copy/conversion like every other fallible call here, so that
        // a failure does not leave the filter running on an uninitialized kernel
        if( CV_MAT_TYPE(kernel->type) == CV_32FC1 )
        {
            CV_CALL( cvCopy( kernel, &kernel_hdr ));
        }
        else
        {
            CV_CALL( cvConvertScale( kernel, &kernel_hdr, 1, 0 ));
        }
        kernel = &kernel_hdr;
    }

    size = cvGetMatSize( src );
    depth = CV_MAT_DEPTH(type);
    src_step = src->step;
    dst_step = dst->step ? dst->step : CV_STUB_STEP;

    if( icvFilter_8u_C1R_p && (src->rows >= ipp_lower_limit || src->cols >= ipp_lower_limit) )
    {
        // pick the IPP entry point for this type; stays 0 for unsupported types
        // or when the particular pointer has not been set
        CvFilterIPPFunc ipp_func =
                type == CV_8UC1 ? (CvFilterIPPFunc)icvFilter_8u_C1R_p :
                type == CV_8UC3 ? (CvFilterIPPFunc)icvFilter_8u_C3R_p :
                type == CV_8UC4 ? (CvFilterIPPFunc)icvFilter_8u_C4R_p :
                type == CV_16SC1 ? (CvFilterIPPFunc)icvFilter_16s_C1R_p :
                type == CV_16SC3 ? (CvFilterIPPFunc)icvFilter_16s_C3R_p :
                type == CV_16SC4 ? (CvFilterIPPFunc)icvFilter_16s_C4R_p :
                type == CV_32FC1 ? (CvFilterIPPFunc)icvFilter_32f_C1R_p :
                type == CV_32FC3 ? (CvFilterIPPFunc)icvFilter_32f_C3R_p :
                type == CV_32FC4 ? (CvFilterIPPFunc)icvFilter_32f_C4R_p : 0;

        if( ipp_func )
        {
            CvSize el_size = { kernel->cols, kernel->rows };
            // anchor measured from the opposite corner, matching the mirrored kernel
            CvPoint el_anchor = { el_size.width - anchor.x - 1, el_size.height - anchor.y - 1 };
            int stripe_size = 1 << 16; // the optimal value may depend on CPU cache,
                                       // overhead of current IPP code etc.
            const uchar* shifted_ptr;
            int i, j, y, dy = 0;
            int temp_step;

            // mirror the kernel around the center
            // (each pass swaps the four symmetric elements of a row pair; the
            //  values are read into a..d first so the middle row/column, where
            //  the pointers coincide, is handled correctly)
            for( i = 0; i < (el_size.height+1)/2; i++ )
            {
                float* top_row = kernel->data.fl + el_size.width*i;
                float* bottom_row = kernel->data.fl + el_size.width*(el_size.height - i - 1);

                for( j = 0; j < (el_size.width+1)/2; j++ )
                {
                    float a = top_row[j], b = top_row[el_size.width - j - 1];
                    float c = bottom_row[j], d = bottom_row[el_size.width - j - 1];
                    top_row[j] = d;
                    top_row[el_size.width - j - 1] = c;
                    bottom_row[j] = b;
                    bottom_row[el_size.width - j - 1] = a;
                }
            }

            CV_CALL( temp = icvIPPFilterInit( src, stripe_size, el_size ));

            // IPP reads from the temporary buffer, offset by the anchor position
            shifted_ptr = temp->data.ptr +
                anchor.y*temp->step + anchor.x*CV_ELEM_SIZE(type);
            temp_step = temp->step ? temp->step : CV_STUB_STEP;

            // process the image stripe by stripe; dy is the number of rows
            // handled in the current pass
            // NOTE(review): icvIPPFilterNextStripe presumably loads the next
            // border-extended stripe of src into temp -- confirm
            for( y = 0; y < src->rows; y += dy )
            {
                dy = icvIPPFilterNextStripe( src, temp, y, el_size, anchor );
                IPPI_CALL( ipp_func( shifted_ptr, temp_step,
                    dst->data.ptr + y*dst_step, dst_step, cvSize(src->cols, dy),
                    kernel->data.fl, el_size, el_anchor ));
            }
            EXIT;   // done; skip the generic path and go to cleanup
        }
    }

    // Generic path. Validate the format before allocating the filter state so
    // that unsupported inputs are rejected without any allocation.
    if( CV_MAT_CN(type) == 2 )
        CV_ERROR( CV_BadNumChannels, "Unsupported number of channels" );

    func = (CvFilterFunc)(filter_tab.fn_2d[depth]);

    if( !func )
        CV_ERROR( CV_StsUnsupportedFormat, "" );

    CV_CALL( state = icvFilterInitAlloc( src->cols, cv32f, CV_MAT_CN(type),
                                   cvSize(kernel->cols, kernel->rows), anchor,
                                   kernel->data.ptr, ICV_GENERIC_KERNEL ));

    // single-row arrays get the stub step for both source and destination
    if( size.height == 1 )
        src_step = dst_step = CV_STUB_STEP;

    IPPI_CALL( func( src->data.ptr, src_step, dst->data.ptr,
                     dst_step, &size, state, 0 ));

    __END__;

    // common cleanup, reached on success and on every error exit
    cvReleaseMat( &temp );
    icvFilterFree( &state );
    if( !local_alloc )
        cvFree( (void**)&kernel_data );
}
 
/* End of file. */