* www.pudn.com > scaling.rar > scale_horz_h.asm
* ========================================================================= *
* *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* scale_horz *
* *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void scale_horz *
* ( *
* unsigned short *in_data, /* Ptr to unscaled lines */ *
* unsigned int in_len, /* Pixels/line unscaled */ *
* short *out_data, /* Ptr to scaled data lines */ *
* unsigned int out_len, /* Pixels/line of scaled data */ *
* short *hh, /* Ptr to filter taps, *
* interleaved odd/even *
* outputs */ *
* unsigned int l_hh, /* Length of scaling filters */ *
* unsigned int n_hh, /* Number of scaling filters */ *
* short *patch /* Ptr to decrement pattern */ *
* ); *
* *
* DESCRIPTION *
* *
* This code can scale up or down 1 line of data, in the *
* ratio out_len : in_len, e.g. 1:3, 4:3, 5:6. The *
* filters are designed outside of the loop using a *
* general purpose resizing algorithm. *
* *
* patch0 = patch + 2; *
* filter_count = n_hh; *
* ka = 0; *
* *
* line0_x = plane_x; *
* line0_y = plane_y; *
* ptr_hh = hh; *
* jump = (int) patch[0]; ka = jump >> 1; *
* jump = (int) patch[1]; kb = jump >> 1; *
* *
* for ( i = 0; i < n_y; i += 2) *
* { *
* y0 = 1 << 5; *
* y1 = 1 << 5; *
* for ( j = 0; j < l_hh; j+=4) *
* { *
* /* even outputs */ *
* for (k=0; k < 4; k++) *
* { *
* h0 = *ptr_hh++; *
* x0 = *(line0_x+ ka + k); *
* y0 += ( x0 * h0 ); *
* } *
* jump = (int) (*patch0++); *
* ka = ka + (jump>>1); *
* /* odd outputs */ *
* for (k=0; k < 4; k++) *
* { *
* h1 = *ptr_hh++; *
* x1 = *(line0_x + kb + k); *
* y1 += ( x1 * h1 ); *
* } *
* jump = (int) (*patch0++); *
* kb = kb + (jump>>1); *
* } *
* *line0_y++ = (short) (y0 >> 6) ; *
* *line0_y++ = (short) (y1 >> 6) ; *
* *
* filter_count -= 2; *
* if (!filter_count) *
* { *
* patch0 = patch + 2; *
* ptr_hh = hh; *
* filter_count = n_hh; *
* } *
* } *
* *
* ASSUMPTIONS *
* One line of data is produced per function call. *
* *
* The line must be aligned on a double word boundary and be a *
* multiple of 8 bytes. *
* *
* Filters are multiples of 4 taps, maximum number of filters is 256. *
* *
* The computations for each output are interleaved, thus the filters *
* are interleaved on a 4 short interval. *
* *
* Little ENDIAN Configuration is used and the input and output data *
* is 16 bit unsigned and signed shorts respectively. The filters *
* are also 16 bit signed shorts in 12 bit precision. *
* *
* The n_hh filters are all of the same length and are *
* strung together in a single linear array. *
* *
* Interrupts are masked by the function for most of its duration. *
* *
* MEMORY NOTE *
* Some bank hits will occur in this code for certain scale *
* factors and filter lengths. *
* *
* For 4 taps k = 0, for l_hh = 8, k = 0.031, for l_hh = 16, k = 0.015. *
* Different filter lengths can produce different numbers of bank *
* conflicts. Overall, these bank conflicts have nearly zero effect. *
* *
* For l_hh=4: k=0, l_hh=8: k=1/32, l_hh=12: k=0, l_hh=16: k=1/64 *
* For l_hh % 8 == 0, k = 1/(4*l_hh) else k = 0 *
* *
* 'k' is the bank conflict between the store and the guidance table *
* load. Depending on the relative sizes of the filters and *
* memory width, this bank conflict is between 0 and 3.1% *
* overhead. *
* *
* TECHNIQUES *
* The outputs are computed using interleaved inputs. The patch table *
* controls the access of 2 parallel pointers. For example an 8/33 *
* scale factor will have the following access pattern. *
* *
* 11111111112222222222333333333344444444445555555555 *
* 012345678901234567890123456789012345678901234567890123456789 *
* *
* 0 e xxxxxxxx <-start point of even output 0 *
* 1 o xxxxxxxx <-start point of odd output 4 *
* 2 e xxxxxxxx *
* 3 o xxxxxxxx *
* 4 e xxxxxxxx *
* 5 o xxxxxxxx *
* 6 e xxxxxxxx *
* 7 o xxxxxxxx *
* 0 e xxxxxxxx <-next start *
* 1 o xxxxxxxx <-next start *
* *
* *
* From this diagram the even pointer jumps 4 then another 4 as the *
* filters have 8 taps, it then jumps 4 to get to the next set of *
* input data. The odd pointer does the same. These jumps are *
* interleaved and so are the filter coefficients. The jumps are *
* in multiples of bytes as non-scaled non-aligned double word *
* accesses are used. In this case the table will be: *
* *
* short patch[] = {0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,10,10,8,8}; *
* *
* Notice the first 2 entries are the initial starting points for *
* the two pointers. To remove a dependency in the code the last 2 *
* entries are copies of the 2nd two. This makes the table almost *
* circular. *
* *
* NOTES *
* Other scale factors can be achieved with the following *
* example tables. *
* *
* Scale Factor Taps Table short jump[] = *
* -------------------------------------------------------------------- *
* 5/6 4 {0, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 3, 2, 2} *
* 4/3 8 {0, 4, 4, 4, -3, -2, 4, 4, -2, -3, 4, 4} *
* 3/4 12 {0,1,4,4,4,4,-6,-5,4,4,4,4,-5,-6,4,4,4,4,-5,-5,4,4} *
* 6/5 16 {0,0,4,4,4,4,4,4,-11,-10,4,4,4,4,4,4,-10,-10, *
* 4,4,4,4,4,4,-10,-11,4,4} *
* *
* The software to produce these tables and the simple coefficients *
* for an arbitrary scale factor and number of taps can be found *
* in the api document. Note in the case of 3/4, odd scale factors *
* are doubled to make 6/8 instead of 3/4 *
* *
* CYCLES *
* cycles = 0.5 * out_len * l_hh * (1+k) + 30. *
* If (l_hh % 8) == 0 then k = 1/(4*l_hh) else k = 0. *
* *
* For l_hh = 16, in_len = 1024, and out_len = 1366, cycles = 11129. *
* For l_hh = 8, in_len = 640, and out_len = 120, cycles = 525. *
* *
* CODESIZE *
* 452 bytes *
* ------------------------------------------------------------------------- *
* Copyright (c) 2001 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
; Copyright notice, emitted as two consecutive string directives into the
; ".data:copyright_h" section under the label _Copyright.
.sect ".data:copyright_h"
_Copyright: .string "Copyright (C) 2001 Texas Instruments Incorporated. "
.string "All Rights Reserved."
; NOTE(review): no directive that switches to a code section is visible
; between the .sect above and the routine's entry label; presumably the
; included file provides it (and any .global for the entry point) -- confirm.
.include "scale_horz_h.h62"
; Entry point of the horizontal scaler. The USAGE prototype in the file header
; calls the routine scale_horz; the label actually defined here is
; _scale_horz_asm.
; NOTE(review): A_filt_state (A12) is dereferenced by the code below but has no
; counterpart in the 8-argument USAGE prototype; presumably it is a 9th
; argument pointing at a small state block -- confirm against the caller.
_scale_horz_asm:
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== *
; Symbolic names for the registers used by the routine.
; The first eight names (plane_x, n_x, plane_y, n_y, hh, l_hh, n_hh, patch)
; presumably correspond, in order, to in_data, in_len, out_data, out_len, hh,
; l_hh, n_hh and patch of the USAGE prototype, assuming the usual C6000
; argument-register order -- TODO confirm.
; Several names alias the same physical register (A16, A17, A18, A22, B16, B17,
; B18, B19, B22); each alias is noted on its line. Any change to the schedule
; has to keep the uses of aliased names apart.
.asg A4, A_plane_x ; input line base; copied to A_line0_x0 / B_line0_x1
.asg B4, B_n_x ; input length; not referenced by the code in this file
.asg A6, A_plane_y ; output line base; copied to B_line0_y
.asg B6, B_n_y ; output length; used only to derive the loop count B_i
.asg A8, A_hh ; base of the filter coefficient array
.asg B8, B_l_hh ; taps per filter
.asg A10, A_n_hh ; number of filters
.asg B10, B_patch ; base of the patch (jump) table
.asg A12, A_filt_state ; state block: word 0 read at entry, words 1 and 2 written at exit
.asg B18, B_filt_no ; max_filt - f_cnt (prolog only; B18 is also B_p1)
.asg B16, B_hh0 ; hh + filt_no halfwords (prolog only; B16 is also B_x11x10 / B_t_y10)
.asg A23, A_ptr_hh ; A-side running coefficient pointer (starts at hh0)
.asg B24, B_ptr_hh ; B-side running coefficient pointer (starts at hh0 + 8 bytes)
.asg A9, A_line0_x0 ; A-side input base for LDNDW
.asg B9, B_line0_x1 ; B-side input base for LDNDW
.asg A18, A_ka ; A-side byte offset = low half of A_kbka (A18 is also A_p0)
.asg B17, B_kb ; B-side byte offset = high half of A_kbka (B17 is also B_x13x12)
.asg A17, A_jump10 ; packed pair of jumps loaded from the patch table (A17 is also A_x03x02)
.asg A19, A_kbka ; packed running offsets: kb in the upper half, ka in the lower half
.asg A22, A_patch0 ; running patch-table pointer (A22 is also A_k1k0)
.asg A20, A_y0 ; A-side accumulator
.asg B20, B_y1 ; B-side accumulator
.asg B21, B_line0_y ; output write pointer
.asg A7, A_round ; rounding constant 020h
.asg A0, A_taps ; tap countdown; used as predicate for reset/store
.asg A5, A_l_hh ; A-side copy of l_hh used to reload A_taps
.asg B0, B_f_cnt ; coefficient countdown; used as predicate for pointer rewind
.asg A3, A_patch ; A-side copy of the patch table base
.asg B7, B_hh ; B-side copy of the coefficient base
.asg B5, B_max_filt ; l_hh * n_hh, reload value for B_f_cnt
.asg B22, B_i ; loop counter consumed by BDEC (B22 is also B_csr_no_gie)
.asg A27, A_h03h02 ; A-side coefficients 3:2 (upper word of the LDDW pair)
.asg A26, A_h01h00 ; A-side coefficients 1:0 (lower word of the LDDW pair)
.asg B27, B_h13h12 ; B-side coefficients 3:2
.asg B26, B_h11h10 ; B-side coefficients 1:0
.asg A17, A_x03x02 ; A-side input samples 3:2 (A17 is also A_jump10)
.asg A16, A_x01x00 ; A-side input samples 1:0 (A16 is also A_t_y0)
.asg B17, B_x13x12 ; B-side input samples 3:2 (B17 is also B_kb)
.asg B16, B_x11x10 ; B-side input samples 1:0 (B16 is also B_hh0 / B_t_y10)
.asg B19, B_one ; constant 1 used as MPYLH operand (B19 is also B_t_y1)
.asg A24, A_p00 ; A-side DOTP2 result for taps 1:0
.asg A21, A_p01 ; A-side DOTP2 result for taps 3:2
.asg A18, A_p0 ; A_p00 + A_p01 (A18 is also A_ka)
.asg B23, B_p10 ; B-side DOTP2 result for taps 1:0
.asg B28, B_p11 ; B-side DOTP2 result for taps 3:2
.asg B18, B_p1 ; B_p10 + B_p11 (B18 is also B_filt_no)
.asg A16, A_t_y0 ; A_y0 >> 6 (A16 is also A_x01x00)
.asg B19, B_t_y1 ; B_y1 >> 6 (B19 is also B_one)
.asg B16, B_t_y10 ; packed output word: t_y1 upper half, t_y0 lower half
.asg A22, A_k1k0 ; patch[0..1] reloaded in the epilog (A22 is also A_patch0)
.asg A28, A_kbka_ ; (A_kbka - A_k1k0) >> 1 per halfword; stored to filt_state[1]
.asg B29, B_csr ; CSR value saved at entry, restored at exit
.asg B22, B_csr_no_gie ; saved CSR with bit 0 cleared (B22 is also B_i)
* =========================== PIPE LOOP PROLOG ============================ *
; Prolog: masks interrupts, derives the starting positions in the coefficient
; array and patch table from the incoming state word, initialises counters and
; accumulators, and primes the pipeline.
; The pipeline is primed by five branches into the kernel. Four of them target
; a label plus a byte offset (LOOPY+20, LOOPY1+12, LOOPY2+12, LOOPY3+20), i.e.
; they enter a kernel packet part-way through so that only its trailing
; instructions run on the first pass. With 32-bit instruction words +12 skips
; three instructions and +20 skips five.

; Read the state word filt_state[0] into f_cnt and save CSR for the exit path.
LDW .D1T2 *A_filt_state[0], B_f_cnt ;[ 2,0]
|| MVC .S2 CSR, B_csr ;
; Build a copy of CSR with bit 0 cleared (the name says "no GIE").
ZERO .L1 A_p01 ;
|| AND .S2 B_csr, -2, B_csr_no_gie ;
; Install the modified CSR; the header states that interrupts are masked for
; most of the routine.
ZERO .L2 B_p10 ;
|| MVC .S2 B_csr_no_gie, CSR ;
MPY .M2X B_l_hh, A_n_hh, B_max_filt ;[5,0]max_flt=l_hh*n_hh
; The product registers (p00, p01, p10, p11) and two coefficient registers are
; cleared in this prolog, presumably so that sums formed before the first
; loads arrive contribute zero.
ZERO .L2 B_p11 ;
|| ZERO .L1 A_p00 ;
; filt_no = max_filt - f_cnt; also make cross-side copies of hh and patch.
MV .L2X A_hh, B_hh ;[ 7,0]
|| SUB .D2 B_max_filt, B_f_cnt, B_filt_no ;[ 7,0]filt_cnt=max_flt
|| MV .D1X B_patch, A_patch ;[ 7,0]patch0=patch
; hh0 = hh + filt_no halfwords (ADDAH reads filt_no before the parallel shift
; halves it); load the packed starting offsets patch[0..1] into kbka.
ADDAH .D2 B_hh, B_filt_no, B_hh0 ;[ 8,0]
|| LDW .D1T1 *A_patch[0], A_kbka ;[ 8,0]initial start
|| SHRU .S2 B_filt_no, 1, B_filt_no ;[ 8,0]
; patch0 = patch + 4 bytes (past the two start entries) ...
ADD .L1 A_patch, 4, A_patch0 ;[ 9,0]patch0=patch
|| ZERO .S1 A_h01h00 ;
; ... plus (filt_no >> 1) bytes. First priming branch: enters LOOPY at its
; sixth instruction.
ADD .D1X A_patch0, B_filt_no, A_patch0 ;[10,0]
|| ZERO .L2 B_h13h12 ;
|| B .S1 LOOPY + 20 ;
; Load the first packed jump pair. Second priming branch: enters LOOPY1 at its
; fourth instruction.
LDW .D1T1 *A_patch0++[1], A_jump10 ;[11,0] first offset
|| B .S1 LOOPY1 + 12 ;
; B_i starts as l_hh * n_y (adjusted two packets below); set the output
; pointer, rounding constant 020h and the constant 1; copy l_hh to the A side
; with a multiply by 1. Third priming branch: enters LOOPY2 at its fourth
; instruction.
MPY .M2 B_l_hh, B_n_y, B_i ;[12,0]
|| MV .L2X A_plane_y, B_line0_y ;[12,0]line0_y=plane_y
|| MVK .S1 020h, A_round ;[12,0]
|| MVK .D2 1, B_one ;[12,0]
|| MPY .M1X 1, B_l_hh, A_l_hh ;[12,0]
|| B .S2 LOOPY2 + 12 ;
; Split kbka: kb = high halfword (MPYLH with 1), ka = low halfword (MPY by 1).
; Fourth priming branch: enters LOOPY3 at its sixth instruction.
MPYLH .M2X B_one, A_kbka, B_kb ;[13,0]
|| MPY .M1 1, A_kbka, A_ka ;[13,0]
|| B .S1 LOOPY3 + 20 ;
; B_i >>= 3; y1 = rounding constant; taps = l_hh + 12 (the existing trailing
; comment says l_hh; the extra 12 presumably compensates for pipeline depth
; -- confirm); input base copied with a rotate-by-0 on the M unit. Fifth
; branch: enters LOOPY at its first instruction.
SHRU .S2 B_i, 3, B_i ;[14,0]
|| MPY .M2X 1, A_round, B_y1 ;[14,0] y1 = 1 << 5
|| ADD .L1 12, A_l_hh, A_taps ;[14,0]taps=l_hh
|| ROTL .M1 A_plane_x, 0, A_line0_x0 ;[15,0]line0_x=plane_x
|| B .S1 LOOPY ;
; B_i -= 3; A-side coefficient pointer = hh0, B-side = hh0 + 8 bytes (the next
; four coefficients); B-side input base; y0 = rounding constant; load the
; second packed jump pair.
SUB .S2 B_i, 3, B_i ;[15,0]2 + 1
|| ADD .D2 B_hh0, 8, B_ptr_hh ;[15,0]ptr_hh=hh+1
|| MV .L1X B_hh0, A_ptr_hh ;[15,0]ptr_hh = hh
|| MV .L2X A_plane_x, B_line0_x1 ;[15,0]line1_x=plane_x
|| MV .S1 A_round, A_y0 ;[15,0]y0 = 1 << 5
|| LDW .D1T1 *A_patch0++[1], A_jump10 ;[1,1]load next offst
* =========================== PIPE LOOP KERNEL ============================ *
; Steady-state loop body: four execute packets (LOOPY, LOOPY1, LOOPY2, LOOPY3).
; Each pass fetches four coefficients and four input samples per side and
; folds four taps into each of the two accumulators (A side: y0, B side: y1).
; The bracketed tags in the trailing comments are the original scheduling
; annotations.
; WARNING: the prolog branches to LOOPY+20, LOOPY1+12, LOOPY2+12 and LOOPY3+20,
; so the ORDER of instructions inside each packet is load-bearing: the
; instructions listed first in a packet are the ones skipped on the priming
; pass. Do not reorder, insert or remove instructions without updating those
; offsets.

; Packet 1: scale both accumulators (>> 6); count taps down by 4; form two of
; the four DOTP2 partial sums; fetch the next four coefficients per side (each
; pointer advances 2 doublewords); count f_cnt down by 8.
LOOPY:
SHR .S2 B_y1, 6, B_t_y1 ;[18,1]
|| SHR .S1 A_y0, 6, A_t_y0 ;[18,1]
|| SUB .L1 A_taps, 4, A_taps ;[18,1]
|| DOTP2 .M1 A_h01h00, A_x01x00, A_p00 ;[10,3]
|| DOTP2 .M2 B_h13h12, B_x13x12, B_p11 ;[10,3]
|| LDDW .D1T1 *A_ptr_hh++[2], A_h03h02:A_h01h00 ;[ 2,5]h3:0=*ptr_hh++
|| LDDW .D2T2 *B_ptr_hh++[2], B_h13h12:B_h11h10 ;[ 2,5]h3:0=*ptr_hh++
|| SUB .L2 B_f_cnt, 8, B_f_cnt ;[ 2,5]fil_count-=8
; Packet 2: when taps == 0, reload both accumulators with the rounding
; constant (multiply by 1 used as a move); p1 = p10 + p11; advance both packed
; offsets (kbka += jump10, halfword-wise); when f_cnt == 0, rewind the
; coefficient pointers to hh / hh+8 and the patch pointer to patch+8; load
; four B-side input samples at byte offset kb (non-aligned doubleword load).
LOOPY1:
[!A_taps]MPY .M1 1, A_round, A_y0 ;[19,1]if(!samp)y0=y1=0
||[!A_taps]MPY .M2X 1, A_round, B_y1 ;[19,1]if(!samp)y0=y1=0
|| ADD .S2 B_p10, B_p11, B_p1 ;[15,2]
||[!B_f_cnt]MV .S1X B_hh, A_ptr_hh ;[3,5](!flt_c)p_hh=hh
|| ADD2 .D1 A_jump10, A_kbka, A_kbka ;[3,5]
||[!B_f_cnt]ADD .L2 B_hh, 8, B_ptr_hh ;[3,5](!flt_c)p_hh=hh+4
||[!B_f_cnt]ADD .L1 A_patch, 8, A_patch0 ;[3,5](!flt_c)ptch0=ptch
|| LDNDW .D2T2 *B_line0_x1(B_kb), B_x13x12:B_x11x10;[3,5]x3:0=*(line0_x+kb)
; Packet 3: pack the two scaled results into one word (t_y1 in the upper
; halfword, t_y0 in the lower); p0 = p00 + p01; third DOTP2; loop branch with
; decrement of B_i; load four A-side input samples at byte offset ka; extract
; the new ka from the low half of kbka; re-create B_one, whose register (B19)
; was overwritten as B_t_y1 in packet 1.
LOOPY2:
PACK2 .L2X B_t_y1, A_t_y0, B_t_y10 ;[20,1]
|| ADD .S1 A_p00, A_p01, A_p0 ;[16,2]
|| DOTP2 .M2 B_h11h10, B_x11x10, B_p10 ;[ 8,4]
|| BDEC .S2 LOOPY, B_i ;[16,2]
|| LDNDW .D1T1 *A_line0_x0(A_ka), A_x03x02:A_x01x00;[4,5]x3:0=*(line0_x+ka)
|| MPY .M1 1, A_kbka, A_ka ;[ 4,5]
|| MVK .D2 1, B_one ;[ 4,5]
; Packet 4: when taps == 0, store the packed output pair, advance the output
; pointer by one word and reload taps = l_hh; accumulate p0 / p1 into y0 / y1;
; fourth DOTP2; when f_cnt == 0 reload it with max_filt; extract the new kb
; from the high half of kbka; prefetch the next packed jump pair.
LOOPY3:
[!A_taps]MV .L1 A_l_hh, A_taps ;[21,1]taps=l_hh
||[!A_taps]STW.D2T2 B_t_y10, *B_line0_y++[1] ;[21,1]if(!samp)
|| ADD .S1 A_p0, A_y0, A_y0 ;[17,2] *line0_y++=t_y0
|| ADD .S2 B_p1, B_y1, B_y1 ;[17,2]
|| DOTP2 .M1 A_h03h02, A_x03x02, A_p01 ;[9,4]
||[!B_f_cnt]MV .L2 B_max_filt, B_f_cnt ;[5,5]if(!flt_c)
|| MPYLH .M2X B_one, A_kbka, B_kb ;[5,5] flt_cnt=max_flt
|| LDW .D1T1 *A_patch0++[1], A_jump10 ;[1,6]load next offst
* =========================== PIPE LOOP EPILOG ============================ *
; Epilog: drains the software pipeline. The packets repeat the kernel's
; shift / accumulate / pack / store work but issue no further coefficient,
; input-sample or jump-table loads.

; Drain packet 1. Also reloads the initial offsets patch[0..1] into A_k1k0
; (A22, which served as the running patch pointer during the loop).
SHR .S2 B_y1, 6, B_t_y1 ;[18,3]
|| SHR .S1 A_y0, 6, A_t_y0 ;[18,3]
|| SUB .L1 A_taps, 4, A_taps ;[18,3]
|| DOTP2 .M1 A_h01h00, A_x01x00, A_p00 ;[10,5]
|| DOTP2 .M2 B_h13h12, B_x13x12, B_p11 ;[10,5]
|| LDW .D1T1 *A_patch[0],A_k1k0 ;
; Drain packet 2: conditional accumulator reset and p1 = p10 + p11.
[!A_taps]MPY .M1 1, A_round, A_y0 ;[19,3]if(!samp)y0=y1=0;
||[!A_taps]MPY .M2X 1, A_round, B_y1 ;[19,3]if(!samp)y0=y1=0;
|| ADD .S2 B_p10, B_p11, B_p1 ;[15,4]
; Drain packet 3: pack outputs, p0 = p00 + p01, last B-side DOTP2. Issues a
; branch back to PIPE_DOWN and sets B_i = 0 for the BDEC inside PIPE_DOWN.
PACK2 .L2X B_t_y1, A_t_y0, B_t_y10 ;[20,3]
|| ADD .L1 A_p00, A_p01, A_p0 ;[16,4]
|| DOTP2 .M2 B_h11h10, B_x11x10, B_p10 ;[ 8,6]
|| B .S1 PIPE_DOWN ;
|| MVK .S2 0, B_i ;
; Drain packet 4: conditional store and taps reload, accumulate, last A-side
; DOTP2.
[!A_taps]MV .L1 A_l_hh, A_taps ;[21,3]taps = l_hh
||[!A_taps]STW.D2T2 B_t_y10, *B_line0_y++[1] ;[21,3]if(!sample)
|| ADD .S1 A_p0, A_y0, A_y0 ;[17,4] *line0_y++=t_y0
|| ADD .S2 B_p1, B_y1, B_y1 ;[17,4]
|| DOTP2 .M1 A_h03h02, A_x03x02, A_p01 ;[ 9,6]
; PIPE_DOWN: four-packet tail that is entered by fall-through and re-entered
; through the B issued above and the BDEC below.
; NOTE(review): counting branch delay slots, the B and the BDEC (taken once,
; since B_i starts at 0) each return control here once, so the block appears to
; run three times in total -- confirm on a simulator.
; While draining, it also derives the value written back to filt_state[1]:
; kbka_ = (kbka - k1k0) >> 1, computed independently on each halfword.
PIPE_DOWN:
SHR .S2 B_y1, 6, B_t_y1 ;[18,4]
|| SHR .S1 A_y0, 6, A_t_y0 ;[18,4]
|| SUB .L1 A_taps, 4, A_taps ;[18,4]
|| DOTP2 .M1 A_h01h00, A_x01x00, A_p00 ;[10,6]
|| DOTP2 .M2 B_h13h12, B_x13x12, B_p11 ;[10,6]
; Conditional accumulator reset, p1 sum, and the halfword-wise subtraction of
; the initial offsets from the running offsets.
[!A_taps]MPY .M1 1, A_round, A_y0 ;[19,4]if(!samp)y0=y1=0
||[!A_taps]MPY .M2X 1, A_round, B_y1 ;[19,4]if(!samp)y0=y1=0
|| ADD .S2 B_p10, B_p11, B_p1 ;[15,5]
|| SUB2 .L1 A_kbka, A_k1k0, A_kbka_ ;
; Pack outputs, p0 sum, loop-back branch, and halving of both offset
; differences.
PACK2 .L2X B_t_y1, A_t_y0, B_t_y10 ;[20,4]
|| ADD .L1 A_p00, A_p01, A_p0 ;[16,5]
|| BDEC .S2 PIPE_DOWN, B_i ;
|| SHR2 .S1 A_kbka_, 1, A_kbka_ ;
; Conditional store of the packed output pair and taps reload; accumulate.
[!A_taps]MV .L1 A_l_hh, A_taps ;[21,4]taps=l_hh
|| ADD .S1 A_p0, A_y0, A_y0 ;[17,5]
|| ADD .S2 B_p1, B_y1, B_y1 ;[17,5]
||[!A_taps]STW.D2T2 B_t_y10, *B_line0_y++[1] ;[21,4](!samp)
; *line0_y++=t_y0
* ========================================================================= *
; Return sequence: branch to the address in B3. The three instructions after
; the BNOP execute before the branch takes effect (see ";BRANCH OCCURS"), so
; their number and order must be preserved.
; They write the halved offset differences to filt_state[1] and the remaining
; coefficient count to filt_state[2], then restore the CSR value saved at
; entry.
; NOTE(review): f_cnt is read from filt_state[0] at entry but written to
; filt_state[2] here, and filt_state[1] is written but never read in this
; file; presumably the caller moves the values between calls -- confirm.
BNOP .S2 B3, 2 ;return to call
STW .D1T1 A_kbka_, *A_filt_state[1] ;
STW .D1T2 B_f_cnt, *A_filt_state[2] ;
MVC .S2 B_csr, CSR ;
;BRANCH OCCURS
* ========================================================================= *
* End of file: scale_horz_h.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2001 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *