www.pudn.com > three_step_search.rar > three_step_search.asm
/*******************************************************************************
Copyright(c) 2000 - 2002 Analog Devices. All Rights Reserved.
Developed by Joint Development Software Application Team, IPDC, Bangalore, India
for Blackfin DSPs ( Micro Signal Architecture 1.0 specification).
By using this module you agree to the terms of the Analog Devices License
Agreement for DSP Software.
********************************************************************************
Module Name : three_step_search.asm
Label Name : __three_step_search
Version : 1.2
Change History :
Version Date Author Comments
1.2 11/18/2002 Swarnalatha Tested with VDSP++ 3.0
compiler 6.2.2 on
ADSP-21535 Rev.0.2
1.1 11/13/2002 Swarnalatha Tested with VDSP++ 3.0
on ADSP-21535 Rev.0.2
1.0 07/02/2001 Vijay Original
Description : This routine does the motion vector computation using the
Three step search algorithm for a given macroblock. The
integer pel motion vector is first computed and a half pixel
correction is given. The range covered by different step sizes
is shown below :
=====================================
| Initial | |
| step size (SS) | Range (SR) |
=====================================
| 4 | -7.5 to 7.5 |
| 5 | -8.5 to 8.5 |
| 6 | -10.5 to 10.5 |
| 7 | -11.5 to 11.5 |
| 8 | -14.5 to 14.5 |
| 9 | -15.5 to 15.5 |
=====================================
The input and output to this routine is through a structure.
The motion vectors are written back to the same structure.
The input/output structure is declared as follows :
typedef struct
{
unsigned char *ptr_target;
// Target block address (16x16)
unsigned char *ptr_reference;
// Reference window address
int winwidth; // Width of the reference window
int step_size; // Initial step size
tss_struct *ptr_tss;// Pointer to the initialized tss_struct
short mv_x; // Address of the horizontal MV
short mv_y; // Address of the vertical MV
}tss_par;
where tss_struct is defined as
typedef struct
{
short vmv[9];
short hmv[9];
short modifier[25];
}tss_struct;
The tss_struct is initialized by calling the function
__init_tss() once before invoking the TSS routine.
This structure is used by the TSS routine in the motion vector
computation (Refer init_tss.asm for initialization details).
The target block is assumed to be stored in a 16x16 buffer and
the pointer to this buffer is initialized in the tss_par
structure.
The reference block stores all the required data for covering
the range of the step size and for doing the half pixel
interpolation, that is, if the initial step size is SS, then
the range covered is SR = (SS + (SS>>1) + (SS>>2)). In
addition to this we need one more row/column for half pixel
interpolation.
Thus, in the reference window we have to store a stretch of
{2*(SR + 1) + 16} pixels around the target block. The width of
the reference window becomes WINWIDTH = {2*(SR+1) + 16}. The
following picture depicts the data storage in the reference
window.
Data layout of the reference window
<------- WINWIDTH ---------->
------------------------------- ---
| --------------------------- | |
| | | | | |
1 pel gap->| |<- SR | | |
for half | | | | | W
pel inter.| | ----------- | | I
| | | TARGET | | | N
| | | (ZERO | | | W
| |< SR >| MOTION | | | I
| | | POSITION)| | | D
| | | | | | T
| | ----------- | | H
| | <--- 16 ---> | | |
| | | | |
| --------------------------- | |
------------------------------ ---
Assumption : The width of the reference window is assumed tobe a multiple
of 4.
Prototype : void three_step_search(tss_par *tss_in_out);
Registers used : A0, A1, R0-R7, I0-I2, M0, M1, L0-L2, P0-P5, LC0, LC1.
Performance :
Code size :
three_step_search : 688 bytes
hpel : 768 bytes
init_tss : 192 bytes
Total cycle count : 4239 cycles
Cycle count split up :
Integer pel estimation : 2544 cycles
Half pel estimation : 1695 cycles
The cycle count given above is for the first iteration of the test case 1 in
test_three_step_search.c
*******************************************************************************/
#define PTR_MACROBLOCK 0
#define PTR_REFERENCE 4
#define WINWIDTH 8
#define STEP_SIZE 12
#define PTR_TSS 16
#define H_MV 20
#define V_MV 22
.section L1_code;
.align 8;
.global _three_step_search;
.extern __hpel;
_three_step_search:
[--SP] = (R7:4, P5:3);
P5 = R0; // Address of tss parameter structure
[--SP] = RETS;
SP += -36;
L0 = 0;
L1 = 0;
L2 = 0;
I2 = SP; // I2 points to mv_ind array
R7 = R7 - R7 (S) || R0 = [P5 + STEP_SIZE];
// Initial step size
R1 = R0 >>> 1 || R3 = [P5 + WINWIDTH];
// WINWIDTH
R2 = R0 >>> 2 || P4 = [P5 + PTR_TSS];
// Address of tss structure
R0 = R0 + R1(S);
R0 = R0 + R2 (S) || R1 = [P5 + PTR_MACROBLOCK] || [I2++] = R7;
// Fetch the address of the target block
R0 += 1; // Search Range (SR)
I0 = R1;
R1 = R0.L*R3.L (IS) || R2 = [P5 + PTR_REFERENCE];
// (SR*(WINWIDTH+1)),Address of reference window
R1 = R0 + R1 (NS) || [I2++] = R7;
// [SP] = mv_ind[0]
R1 = R2 + R1 (NS) || [I2--] = R7;
// ref_ptr = reference + SR(WINWIDTH + 1),
// [SP + 8] = mv_ind[2]
I1 = R1; // Address of reference data
R4 = R4 - R4 (S) || [SP + 20] = R1;
// [SP + 20] = ref_ptr
P4 += 36;
A1 = A0 = 0 || [SP + 24] = P4;
// [SP + 24] = Address of modifier[0]
P4 += 2;
M0 = -260 (X);
R3 = [P5 + WINWIDTH];
R3 += -16;
M1 = R3; // Modifier for the reference window
P2 = 16 (Z);
[SP + 28] = P2; // To retrieve the loop count if it is modified
/******************** ZERO MOTION VECTOR POSITION *****************************/
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
// Fetch the first data from the two blocks
LSETUP (MAD_START, MAD_END) LC0=P2;
MAD_START:
DISALGNEXCPT || R3 = [I1++];
SAA (R1:0,R3:2) || R1 = [I0++] || R2 = [I1++];
// Compute absolute difference and acc
SAA (R1:0,R3:2) (R) || R0 = [I0++] || R3 = [I1++];
SAA (R1:0,R3:2) || R1 = [I0++] || R2 = [I1 ++ M1];
MAD_END:SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
// Dummy fetch using I0, modifier[k++]
R3=A1.L+A1.H,R2=A0.L+A0.H || R0 = [I0 ++ M0] || R1 = W[P4++] (X);
R5 = R2 + R3 (S) || R0 = [SP + 20];
// Add the accumulated values in both MACs
R1 = R0 + R1 (S) || I2 -= 4;
I1 = R1;
/************************ THREE STEP SEARCH **********************************/
STEP_LOOP:
P3 = 2;
LSETUP (ST_SEARCH, END_SEARCH) LC1=P2>>1;
ST_SEARCH:
A1 = A0 = 0;
LSETUP (MAD_START1, MAD_END1) LC0=P2;
DISALGNEXCPT || R0 = [I0++] || R2 = [I1++];
MAD_START1: DISALGNEXCPT || R3 = [I1++];
SAA (R1:0,R3:2) || R1 = [I0++] || R2 = [I1++];
// Compute absolute difference and acc
SAA (R1:0,R3:2) (R) || R0 = [I0++] || R3 = [I1++];
SAA (R1:0,R3:2) || R1 = [I0++] || R2 = [I1 ++ M1];
MAD_END1: SAA (R1:0,R3:2) (R) || R0 = [I0++] || R2 = [I1++];
// Dummy fetch using I0, modifier[k++]
R3=A1.L+A1.H,R2=A0.L+A0.H || R0 = [I0 ++ M0] || R1 = W[P4++] (X);
R6 = R2 + R3 (S) || R0 = [SP + 20];
// Add the accumulated values in both MACs
R1 = R0 + R1;
I1 = R1;
CC = R6 == R5;
IF !CC JUMP LESS (BP);
R3 = P3;
R0 = [P5 + PTR_TSS];// Address of vmv[]
R1 = 18;
R1 = R1 + R0 (S) || [I2] = R4;
// Address of hmv[]
R2 = SP; // Address of mv_ind[]
R3 = R3 >> 1 || [SP + 16] = R7;
// step number (step)
[SP + 12] = R3; // Current index (c)
CALL __compute_distance;
R4 = R0 << 0 || P2 = [SP + 28];
// Store the returned index, Restore loop count
LESS: CC = R6 < R5;
IF CC R5 = R6;
IF CC R4 = P3;
END_SEARCH:
P3 += 2; // Index increment
// ref_ptr update
P4 += -2;
R1 = R7 << 4 || R0 = [SP + 24];
R1 = R1 + R4 (NS) || [I2++] = R4;
// store mv_ind[j]
R0 = R0 + R1 (NS) || R3 = W[P4++] (X);
P0 = R0;
R2 = R2 - R2 (S) || R1 = [SP + 20];
R7 += 1;
CC = R4 == 0;
R4 = R4 - R4 (S) || R0 = W[P0] (X);
IF CC R0 = R2;
R0 = R0 + R1;
R1 = R0 + R3(NS) || [SP + 20] = R0;
// Updated ref_ptr
I1 = R1;
CC = R7 == 3;
IF !CC JUMP STEP_LOOP (BP);
/********************** MOTION VECTOR COMPUATION **************************/
P3 = [SP + 8];
P0 = [P5 + PTR_TSS]; // Address of vmv
P4 = [P5 + PTR_TSS];
P4 += 18; // Address of hmv
P1 = P0 + P3;
R0.H = W[P1]; // vmv[mv_ind[2]]
P3 = P4 + P3;
R0.L = W[P3]; // hmv[mv_ind[2]]
R1 = R0 >> 14 (V) || P3 = [SP + 4];
P1 = P0 + P3;
P3 = P4 + P3;
R0 = R0 +|+ R1 || R2.H = W[P1];
// vmv[mv_ind[1]]
R3 = R0 >>> 2 (V) || R2.L = W[P3];
// hmv[mv_ind[1]]
R1 = R2 >> 15 (V) || P3 = [SP];
// mv_ind[0]
P1 = P0 + P3;
P3 = P4 + P3;
R2 = R2 +|+ R1 || R0.H = W[P1];
// vmv[mv_ind[0]]
R2 = R2 >>> 1 (V) || R0.L = W[P3];
// hmv[mv_ind[0]]
R3 = R3 +|+ R2;
// R3.H -> HMV, R3.L -> VMV (scaled)
R3 = R3 +|+ R0, R4 = R3 -|- R0 (ASL) || R0 = [SP + 20];
// R0->Best matching block
/************************* HALF PIXEL CCOMPUTATION *************************/
R3 = R3 >> 16 || W[P5 + H_MV] = R3;
// Store Horizontal MV
W[P5 + V_MV] = R3; // Store Vertical MV
R1 = R5; // SAD corresponding to the best match
R2 = [P5 + PTR_MACROBLOCK];
// Address of the target macroblock
R3 = [P5 + WINWIDTH];
[SP + 12] = R3;
CALL __hpel; // __hpel is in hpel.asm
/***************************************************************************/
R1 = W[P5 + H_MV](Z); // Get horizontal integer pel MV
R2 = W[P5 + V_MV](Z); // Get vertical integer pel MV
R1.L = R1.L + R0.L (S); // Compute half pel horizontal MV
R2.L = R2.L + R0.H (S); // Compute half pel vertical MV
W[P5 + H_MV] = R1;
W[P5 + V_MV] = R2;
SP += 36;
RETS = [SP++];
(R7:4, P5:3) = [SP++];
RTS;
_three_step_search.end:
/* If the SAD of two check points are same this routine computes the
motion vector of the two check points and calculates the distance of
each from the target block. Whichever checkpoint is close to the
target block that index is returned
*/
.global __compute_distance;
.align 8;
__compute_distance:
P2 = R2; // Address of mv_ind[];
P0 = R0; // Address of vmv[]
[--SP] = (R7:6, P5:4);
R0 = R0 - R0 (S) || R3 = [SP + 32];
// step
R6 = R6 - R6 (S) || P4 = [P2++];
P1 = R1; // Address of hmv[]
R2 = -16;
R3 = -R3;
LP_START:
P5 = P0 + P4;
R1.L = W[P5];
P5 = P1 + P4;
R7 = R2 - R0(S) || R1.H = W[P5];
R7 = LSHIFT R1 BY R7.L (V) || P4 = [P2++];
R1 = R1 +|+ R7;
R7 = ASHIFT R1 BY R0.L (V);
R6 = R6 +|+ R7;
R0 += -1;
CC = R3 <= R0;
IF CC JUMP LP_START;
R7 = R6 -|- R7 || P4 = [SP + 28];
A1 = R6.L * R6.L (IS) || R0 = [SP + 28];
// R0 -> c (current index)
A1 += R6.H*R6.H (IS);
P5 = P0 + (P4<<1);
R0 = R0 << 1 || R1.L = W[P5];
P5 = P1 + (P4<<1);
R6 = R2 - R3(S) || R1.H = W[P5];
R6 = LSHIFT R1 BY R6.L (V) || P4 = [P2--];
R1 = R1 +|+ R6;
R6 = ASHIFT R1 BY R3.L (V) || P4 = [P2--];
R6 = R6 +|+ R7;
A0 = R6.L * R6.L (IS);
A0 += R6.H * R6.H (IS) || R1 = [P2];
// R1 -> p[step]
// A0 -> Current MV distance,
// A1 -> Previous MV distance
// R0 -> Current MV index, R1 -> Previous MV index
CC = A0 < A1;
IF !CC R0 = R1;
(R7:6, P5:4) = [SP++];
RTS;
__compute_distance.end: