// www.pudn.com > three_step_search.rar > hpel.asm


/******************************************************************************* 
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved. 
Developed by Joint Development Software Application Team, IPDC, Bangalore, India 
for Blackfin DSPs  ( Micro Signal Architecture 1.0 specification). 
 
By using this module you agree to the terms of the Analog Devices License 
Agreement for DSP Software.  
******************************************************************************** 
Module Name     : hpel.asm 
Label Name      : __hpel 
Version         :   1.0 
Change History  : 
 
                Version     Date          Author            Comments 
                1.0         04/12/2001    Vijay             Original  
 
Description     : This routine does the half pixel computation for the motion  
                  estimation. 
 
Assumption      : The routine assumes that WINWIDTH is a multiple of 4. 
 
Prototype       : int _hpel(unsigned char *best_match, int min_SAD,  
                            unsigned char *target, int WINWIDTH); 
 
                    best_match -> Address of the best matching reference block 
                    min_SAD    -> The minimum SAD corresponding to the best  
                                  matching block. 
                    target     -> Address of the target macro block (16x16) 
                    WINWIDTH   -> Width of the reference window (WINWIDTH) 
 
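                    The return value is the best half-pel offset, packed as 
                    high half = vertical (V), low half = horizontal (H), 
                    each being -1, 0 or 1. It is 0 when no half-pel 
                    candidate has a SAD smaller than min_SAD.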
 
Registers used  : A0, A1, R0-R7, I0-I3, M0, M1, M3, L0-L3, P0, P2-P5, LC0. 
 
Performance: 
            Code size                       :  768 bytes 
            Cycle count for half pixelation : 1695 cycles 
*******************************************************************************/ 
 
// Half pixel computation 
 
.section L1_code; 
.align 8; 
.global __hpel; 
 
/*
 * __hpel -- half-pel refinement around the best full-pel match.
 *
 * Eight candidates are tried in three passes (4 diagonal, 2 left/right,
 * 2 top/bottom).  Each pass first writes interpolated pixels to a
 * temporary buffer on the stack, then calls a SAD helper with the target
 * address and a start offset inside that buffer.  A candidate replaces
 * the current best only when its SAD is strictly smaller (CC = R0 < R5).
 *
 * Inputs : R0 = best_match, R1 = min_SAD, R2 = target,
 *          WINWIDTH read from [SP + 44] after the two pushes below.
 * Output : R0 = winning offset, packed as high half = V, low half = H;
 *          0 when no candidate beat min_SAD.
 *
 * Values held for the whole routine:
 *   P5 = best_match   P4 = target   P3 = WINWIDTH
 *   R5 = smallest SAD so far        R4 = offset belonging to that SAD
 *   R7 = offset of the candidate currently being tested
 *   SP = base of the 344-byte temporary buffer
 */
__hpel: 
 
    // 7 registers + RETS = 32 bytes pushed.  RETS is saved because this
    // routine issues CALLs itself.
    [--SP] = (R7:4, P5:3); 
    [--SP] = RETS; 
    // Zero lengths so that I0-I3 are used as plain linear pointers.
    L0 = 0; 
    L1 = 0; 
    L2 = 0; 
    L3 = 0; 
    P5 = R0;                // Address of the best matching block 
    R5 = R1;                // SAD corresponding to the best match 
    P4 = R2;                // Address of the target block 
    // 44 = 32 bytes pushed above + 12: the slot sits 12 bytes above the SP
    // seen at entry.  Presumably the caller's stack slot for the 4th
    // argument -- confirm against the calling convention in use.
    P3 = [SP + 44];         // Width of the reference window 
    // Reserve the scratch buffer.  The largest fill is the diagonal pass:
    // 17 loop passes x 5 word stores = 340 bytes.
    P0 = 344; 
    SP -= P0; 
    I3 = SP;                // Temporary buffer in stack 
    R4 = 0;                 // Default result: no half-pel offset
/******************** INTERPOLATE DIAGONAL BLOCKS ********************/ 
    // M0 / M3 are pointer steps: +7 followed by -3 nets +4 (one word).
    M0 = 7; 
    M3 = -3 (X); 
    R0 = P5;                // Address of the best match 
    R0 += -1; 
    I0 = R0;                // best match - 1; never loaded through in this
                            // pass, only stepped by M3 (see loop)
    I1 = R0;                // Lower source row: best match - 1
    R2 = P3;                // WINWIDTH 
    R1 = R0 - R2 (S); 
    I2 = R1;                // Address of best match - (WINWIDTH+1) 
    // M1 = WINWIDTH - 17 is the row-wrap step.  Together with the other
    // steps of one loop pass, I1 and I2 each advance by exactly WINWIDTH.
    R2 += -17; 
    M1 = R2; 
    P2 = 17;                // 17 output rows
 
    // The three instructions after LSETUP run once, before the first loop
    // pass, and prime R1:0 / R3:2 and the first RNDL result in R7.
    LSETUP(AVG4_ST, AVG4_END) LC0 = P2; 
    DISALGNEXCPT || R0 = [I1++] || R2 = [I2++]; 
    DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3]; 
    R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || R0 = [I1++M0]; 
    // One loop pass = one output row = five word stores to [I3++] (20
    // bytes).  Each output word is built from a RNDL result (R7) and a
    // RNDH result (R6) merged by the add.  I0 is moved by -3 before each
    // RNDH op and moved back right after it.
    // NOTE(review): presumably BYTEOP2P takes its byte alignment from I0,
    // RNDL/RNDH fill the low/high byte of each half-word, and the I0 nudge
    // shifts the averaging window by one byte -- confirm against the ISA
    // reference.  Loads for the next step are issued beside the current
    // BYTEOP, so the instruction order must not be changed.
AVG4_ST:     
        DISALGNEXCPT || I0 += M3 || R2 = [I2++M0]; 
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++M3] || R2 = [I2++M3]; 
        R7 = R6 + R7 (NS) || I0 -= M3; 
     
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL, R) || [I3++] = R7 || R1 = [I1++M0]; 
        DISALGNEXCPT || I0 += M3 || R3 = [I2++M0]; 
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH, R) || R1 = [I1++M3] || R3 = [I2++M3]; 
        R7 = R6 + R7 (NS) || I0 -= M3; 
     
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M0]; 
        DISALGNEXCPT || I0 += M3 || R2 = [I2++M0]; 
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++M3] || R2 = [I2++M3]; 
        R7 = R6 + R7 (NS) || I0 -= M3; 
     
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL, R) || [I3++] = R7 || R1 = [I1++M0]; 
        DISALGNEXCPT || I0 += M3 || R3 = [I2++M0]; 
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH, R) || R1 = [I1++M3] || R3 = [I2++M3]; 
        R7 = R6 + R7 (NS) || I0 -= M3; 
     
        // Fifth word of the row; the M1 steps move I1 / I2 on to the next
        // source row.
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M1]; 
        DISALGNEXCPT || I0 += M3 || R2 = [I2++M1]; 
        R6 = BYTEOP2P(R1:0,R3:2) (RNDH) || R0 = [I1++] || R2 = [I2++]; 
        R7 = R6 + R7 (NS) || I0 -= M3; 
     
        // Re-prime for the next row (same sequence as before the loop) and
        // store the fifth word.
        DISALGNEXCPT || R1 = [I1++M3] || R3 = [I2++M3]; 
AVG4_END: 
        R7 = BYTEOP2P(R1:0,R3:2) (RNDL) || [I3++] = R7 || R0 = [I1++M0];        
     
/**************** CALCULATE SAD FOR DIAGONAL BLOCKS *******************/ 
    // The buffer rows are 20 bytes apart (5 word stores per loop pass).
    // The four candidates differ only in start offset: +1 is one byte to
    // the right, +20 is one row down.
    R7 = -1;                // (V,H) -> R7.H = -1, R7.L = -1 
    R0 = P4; 
    R1 = SP;                // Row 0, byte 0
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0;          // Keep the smaller SAD ...
    IF CC R4 = R7;          // ... and the offset that produced it
     
    R7.L = 1;               // (V,H) -> R7.H = -1, R7.L = 1 
    R0 = P4; 
    R1 = SP; 
    R1 += 1;                // Row 0, byte 1
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
    R7.H = 1;               // (V,H) -> R7.H = 1, R7.L = 1 
    R0 = P4; 
    R1 = SP; 
    R1 += 21;               // Row 1, byte 1
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
    R7.L = -1;              // (V,H) -> R7.H = 1, R7.L = -1 
    R0 = P4; 
    R1 = SP; 
    R1 += 20;               // Row 1, byte 0
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
/******************** INTERPOLATE LEFT/RIGHT BLOCKS ********************/ 
    // Two source pointers on the same row, one byte apart.
    R0 = P5;                // Address of the best match 
    I0 = R0;                // Address of best match 
    R0 += -1; 
    I1 = R0;                // Address of best match - 1 
    I3 = SP;                // Output buffer 
    P2 = 16 (Z);            // 16 output rows
    // M1 = WINWIDTH - 20: with the five +4 steps per loop pass, I0 and I1
    // each advance by exactly WINWIDTH per row.
    R1 = P3;                // WINWIDTH 
    R1 += -20; 
    M1 = R1; 
    MNOP; 
 
    // One loop pass = one row = five word stores (20 bytes), i.e. the same
    // row stride as the diagonal buffer.
    // NOTE(review): presumably BYTEOP1P averages the bytes addressed via
    // I0 with those addressed via I1 -- confirm against the ISA reference.
    LSETUP(AVG2_LR_ST, AVG2_LR_END) LC0 = P2; 
    DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; 
AVG2_LR_ST: 
        DISALGNEXCPT || R1 = [I0++] || R3  =[I1++];  
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2  =[I1++];  
        R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R1 = [I0++]; 
        DISALGNEXCPT  || [I3++] = R6 || R3  =[I1++];             
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2  =[I1++]; 
        // The M1 steps below move both pointers on to the next source row.
        R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R1 = [I0++M1];  
        DISALGNEXCPT || [I3++] = R6 || R3  =[I1++M1];                
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++]; 
AVG2_LR_END: 
        DISALGNEXCPT || [I3++] = R6 || R2  =[I1++]; 
     
/**************** CALCULATE SAD FOR LEFT/RIGHT BLOCKS *******************/ 
    // Loading the full register also clears R7.H left over from the
    // diagonal tests.
    R7 = 1;                 // (V,H) -> R7.H = 0, R7.L = 1 
    R0 = P4; 
    R1 = SP; 
    R1 += 1;                // One byte to the right of the buffer start
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
    R7.L = -1;              // (V,H) -> R7.H = 0, R7.L = -1 
    R0 = P4; 
    R1 = SP;                // Buffer start
    CALL _compute_sad; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
/******************** INTERPOLATE TOP/BOTTOM BLOCKS ********************/ 
    // Two source pointers in the same column, one row (WINWIDTH) apart.
    R1 = P3;                // WINWIDTH 
    R0 = P5;                // Address of the best match 
    I0 = R0;                // Address of best match 
    R0 = R0 - R1(S) || NOP; 
    I1 = R0;                // Address of best match - WINWIDTH 
    I3 = SP;                // Output buffer 
    P2 = 17;                // 17 output rows
    // M1 = WINWIDTH - 16: with the four +4 steps per loop pass, I0 and I1
    // each advance by exactly WINWIDTH per row.
    R1 += -16; 
    M1 = R1; 
 
    // One loop pass = one row = four word stores (16 bytes), so this
    // buffer has a 16-byte row stride (17 rows = 272 bytes).
    LSETUP(AVG2_TB_ST, AVG2_TB_END) LC0 = P2; 
    DISALGNEXCPT || R0 = [I0++] || R2 = [I1++]; 
AVG2_TB_ST: 
        DISALGNEXCPT || R1 = [I0++] || R3  =[I1++];  
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++] || R2  =[I1++];  
        R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R1 = [I0++]; 
        DISALGNEXCPT  || [I3++] = R6 || R3  =[I1++];             
        R6 = BYTEOP1P(R1:0,R3:2) || R0 = [I0++M1] || R2  =[I1++M1]; 
        R6 = BYTEOP1P(R1:0,R3:2)(R) || [I3++] = R6 || R0 = [I0++];  
AVG2_TB_END: 
        DISALGNEXCPT || [I3++] = R6 || R2  =[I1++];              
     
/**************** CALCULATE SAD FOR TOP/BOTTOM BLOCKS *******************/ 
    // Both start offsets (0 and 16) are whole words from SP, and the rows
    // are 16 bytes apart, hence the _aligned helper.
    R7.H = -1;              // (V,H) -> R7.H = -1, R7.L = 0 
    R7.L = 0; 
    R0 = P4; 
    R1 = SP;                // Row 0
    CALL _compute_sad_aligned; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
    R7.H = 1;               // (V,H) -> R7.H = 1, R7.L = 0 
    R0 = P4; 
    R1 = SP; 
    R1 += 16;               // Row 1 (one 16-byte row down)
    CALL _compute_sad_aligned; 
    CC = R0 < R5; 
    IF CC R5 = R0; 
    IF CC R4 = R7; 
     
/********************************************************************/ 
    R0 = R4;                // Return horizontal and vertical half pel 
    // Release the scratch buffer, then undo the two pushes from the entry
    // sequence in reverse order.
    P0 = 344; 
    SP = SP + P0; 
    RETS = [SP++]; 
    (R7:4, P5:3) = [SP++]; 
    RTS; 
     
     
.align 8; 
/*
 * _compute_sad -- accumulate SAA results over 16 rows of two pixel blocks.
 *
 * In : R0 = address of the target block; read through I0, four words
 *           (16 bytes) per loop pass.
 *      R1 = address of the interpolated block; read through I1, five
 *           words (20 bytes) per loop pass.  Callers in this file pass
 *           byte offsets such as SP + 1 and SP + 21.
 * Out: R0 = A0.L + A0.H + A1.L + A1.H (final add is saturating).
 *
 * Overwrites R0-R3, I0, I1, P0, A0, A1 and LC0 without saving them.
 * Every load is either paired with DISALGNEXCPT or issued beside an SAA.
 * The last loop pass still loads one more word from each pointer (the
 * preload for a following row); those values are never used.
 */
_compute_sad: 
    I0 = R0;                // Address of the target 
    I1 = R1;                // Address of the interpolated block 
    P0 = 16 (Z);            // 16 rows
    A1 = A0 = 0;            // Clear both accumulators
 
    LSETUP (MAD_START1, MAD_END1) LC0=P0; 
    // Runs once before the first loop pass: first word of each block.
    DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];  
MAD_START1: 
        // Extra word of the interpolated row, loaded before the first SAA.
        DISALGNEXCPT || R3 = [I1++]; 
        SAA (R1:0,R3:2) || R1 = [I0++]  || R2 = [I1++]; 
                            // Compute absolute difference and acc  
        SAA (R1:0,R3:2) (R) || R0 = [I0++] || R3 = [I1++]; 
        SAA (R1:0,R3:2) || R1 = [I0++] || R2 = [I1++]; 
MAD_END1: 
        // The loads here fetch the first word of the next row.
        SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; 
    // Fold the four 16-bit partial sums into one value.
    R3=A1.L+A1.H,R2=A0.L+A0.H;     
    R0 = R2 + R3 (S);       // Add the accumulated values in both MACs 
    RTS; 
     
.align 8; 
/*
 * _compute_sad_aligned -- accumulate SAA results over 16 rows of two
 * pixel blocks that both have a 16-byte row stride.
 *
 * In : R0 = address of the target block (I0, four words per loop pass)
 *      R1 = address of the interpolated block (I1, four words per pass)
 * Out: R0 = A0.L + A0.H + A1.L + A1.H (final add is saturating).
 *
 * Overwrites R0-R3, I0, I1, P0, A0, A1 and LC0 without saving them.
 * NOTE(review): unlike _compute_sad, the first load pair is not covered
 * by DISALGNEXCPT, so both addresses presumably have to be word aligned
 * -- confirm.  Callers in this file pass SP and SP + 16.
 * The last loop pass still loads one more word from each pointer; those
 * values are never used.
 */
_compute_sad_aligned: 
    I0 = R0;                // Address of the target 
    I1 = R1;                // Address of the interpolated block 
    P0 = 16;                // 16 rows
 
    LSETUP (MAD_START, MAD_END) LC0=P0; 
    // Runs once before the first loop pass: clear the accumulators and
    // fetch the first word of each block.
    A1=A0=0 || R0 = [I0++] || R2 = [I1++]; 
                            // Initialize accumulators  
MAD_START: 
        SAA (R1:0,R3:2) || R1 = [I0++]  || R3 = [I1++]; 
                            // Compute absolute difference and acc.  
        SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; 
        SAA (R1:0,R3:2) || R1 = [I0++] || R3 = [I1++]; 
        // The loads on the last SAA fetch the first word of the next row.
MAD_END:SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++]; 
    // Fold the four 16-bit partial sums into one value.
    R3=A1.L+A1.H,R2=A0.L+A0.H;     
    R0 = R2 + R3 (S);       // Add the accumulated values in both MACs 
    RTS; 
// End label named after __hpel; it sits after both SAD helpers.
__hpel.end: