www.pudn.com > scaling.rar > scale_vert_h.asm
* ========================================================================= *
* TEXAS INSTRUMENTS, INC. *
* *
* NAME *
* scale_vert *
* *
* USAGE *
* This routine is C-callable and can be called as: *
* *
* void scale_vert *
* ( *
* short *in_cols, /* Input image (16-bit pixels) */ *
* int cols, /* Width of the image. */ *
* short *out_cols, /* Output image (16-bit pixels) */ *
* short *ptr_hh, /* Pointer to filter taps. */ *
* short *mod_hh, /* Pointer to rotated filter taps. */ *
* int l_hh, /* Taps per filter (multiple of 4) */ *
* int start_line /* Starting position for filter. */ *
* ) *
* *
* DESCRIPTION *
* Performs a vertical scaling function on a block of data from a *
* frame store. It uses a scaling filter decided upon by the *
* calling function. The filter is rotated depending on the *
* starting line of the circular buffer. One line of length *
* 'cols' is produced per function call. The same filter runs *
* along the entire length of buffer and perfoms the filter *
* vertically along each set of parallel data points. *
* *
* void scale_vert *
* ( *
* short *ptr_plane_x, *
* short *ptr_plane_y, *
* int n_x, *
* short *ptr_hh, *
* short *mod_hh, *
* int l_hh, *
* int start_line *
* ) *
* { *
* int taps_count, sample, i, j, ka; *
* int max_filt, y0, y1, y2, y3, block; *
* int t_y0, t_y1, t_y2, t_y3; *
* short h0, h1, x0, x1, x2, x3, x01, x11, x21, x31; *
* short * ptr_line0_x, * ptr_line1_x; *
* short * ptr_line2_x, * ptr_line3_x; *
* short * ptr_line0_x1, * ptr_line1_x1; *
* short * ptr_line2_x1, * ptr_line3_x1; *
* short * ptr_line0_y, * filter; *
* *
* taps_count = l_hh - 2; *
* *
* ptr_line0_x = ptr_plane_x; *
* ptr_line0_y = ptr_plane_y; *
* *
* y0 = 0x0; y1 = 0x0; *
* y2 = 0x0; y3 = 0x0; *
* *
* max_filt = n_x*l_hh; *
* *
* /* if we know the pointer will need to wrap around, a positive *
* correction is needed instead of a negative one */ *
* *
* block = n_x*l_hh; *
* block = block - 4; /* next column on */ *
* *
* if (!start_line) start_line = l_hh; *
* filter = &ptr_hh[l_hh-start_line]; *
* j = start_line; *
* for (i = 0; i < l_hh; i++) *
* { *
* mod_hh[i] = *filter++; *
* j--; *
* if (!j) filter= ptr_hh; *
* if (!j) j = l_hh; *
* } *
* filter = mod_hh; *
* ka = 0; *
* *
* ptr_line1_x = ptr_line0_x+1; *
* ptr_line2_x = ptr_line1_x+1; *
* ptr_line3_x = ptr_line2_x+1; *
* *
* ptr_line0_x1 = ptr_line0_x+n_x ; *
* ptr_line1_x1 = ptr_line0_x1+1 ; *
* ptr_line2_x1 = ptr_line1_x1+1 ; *
* ptr_line3_x1 = ptr_line2_x1+1 ; *
* *
* for ( i = 0; i < max_filt; i+=8) *
* { *
* h0 = *filter++; *
* h1 = *filter++; *
* *
* x0 = ptr_line0_x[ka]; *
* x1 = ptr_line1_x[ka]; *
* x2 = ptr_line2_x[ka]; *
* x3 = ptr_line3_x[ka]; *
* *
* x01 = ptr_line0_x1[ka]; *
* x11 = ptr_line1_x1[ka]; *
* x21 = ptr_line2_x1[ka]; *
* x31 = ptr_line3_x1[ka]; *
* ka += 2*n_x; *
* *
* sample = taps_count ; *
* *
* if (!taps_count) filter = mod_hh; *
* if (!taps_count) ka = ka - block; *
* if (!taps_count) taps_count = l_hh ; *
* *
* /* 2 taps have been used, for 1 point */ *
* taps_count -= 2; *
* *
* y0 += (x0*h0 + x01*h1) ; *
* y1 += (x1*h0 + x11*h1) ; *
* y2 += (x2*h0 + x21*h1) ; *
* y3 += (x3*h0 + x31*h1) ; *
* *
* t_y0 = y0 >> 16; t_y1 = y1 >> 16; *
* t_y2 = y2 >> 16; t_y3 = y3 >> 16; *
* *
* if (!sample) y0 = 0x0; if (!sample) y1 = 0x0; *
* if (!sample) y2 = 0x0; if (!sample) y3 = 0x0; *
* *
* if (!sample) *ptr_line0_y++ = t_y0; *
* if (!sample) *ptr_line0_y++ = t_y1; *
* if (!sample) *ptr_line0_y++ = t_y2; *
* if (!sample) *ptr_line0_y++ = t_y3; *
* } *
* } *
* *
* ASSUMPTIONS *
* 'l_hh' must be divisible by 4. Pad with zeros for non % 4. *
* *
* All data must be double word aligned. *
* *
* Input and output, mod_hh, ptr_hh, cols must be multiple of 8. *
* *
* Input filters, hh, must be 12 bit precision. *
* *
* The data is in LITTLE ENDIAN format. *
* *
* The code is uninterruptable during execution. Interrupts are *
* disabled at the beginning and reenabled at the end. *
* *
* TECHNIQUES *
* Inner and outer loops are collapsed into 1 loop. *
* *
* A load store live too long problem is dealt with using a MPY. *
* *
* The filter loop is unrolled 4 times with l_hh % 4 == 0. *
* *
* The output data stores are packed together into words and the *
* outer loop is unrolled 8 times to allow 8 parallel filters to *
* be convolved at once. *
* *
* NOTES *
* Before calling this function the input data needs to be in Q11 *
* form. (It is generally the output from the horizontal scaling *
* function, scale_horz.) The results from this function must be *
* clipped before displaying as some pixels may go < 0 or > 255. *
* *
* MEMORY NOTE *
* No bank hits occur in this code. *
* *
* CYCLES *
* Formula for execution time: *
* cycles = (5 * l_hh * cols) / 16 + 6 * l_hh + 32 *
* For cols = 800 and l_hh = 4, cycles = 1056. *
* *
* CODESIZE *
* 612 bytes *
* ------------------------------------------------------------------------- *
* Copyright (c) 2000 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
.sect ".data:copyright_h"
_Copyright: .string "Copyright (C) 2000 Texas Instruments Incorporated. "
.string "All Rights Reserved."
.include "scale_vert_h.h62"
*==============================================================================*
_scale_vert_asm:
*================= SYMBOLIC REGISTER ASSIGNMENTS: =============================*
.asg A0, A_filter ; copy of filter pointer inc.
.asg A1, A_taps ; number of taps used
.asg A2, A_taps_ ; copy of number of taps used
.asg A3, A_n_x_ ; copy of length of line
.asg A4, A_p01 ; temp sum of products
.asg A4, A_p20 ; temp sum of products
.asg A4, A_p202 ; temp sum of products
.asg A4, A_ptr_plane_x ; pntr to input data buffer
.asg A4, A_x11x01 ; line 0:1 data 1
.asg A4, A_x32x22 ; line 2:3 data 2
.asg A5, A_ka ; index along buffer
.asg A5, A_p02 ; temp sum of products
.asg A6, A_n_x ; width of line in shorts
.asg A6, A_p00 ; temp sum of products
.asg A6, A_p200 ; temp sum of products
.asg A6, A_p203 ; temp sum of products
.asg A6, A_x33x23 ; line 2:3 data 3
.asg A7, A_p201 ; temp sum of products
.asg A7, A_p23 ; temp sum of products
.asg A8, A_mod_hh ; point to rotated filter
.asg A8, A_p22 ; temp sum of products
.asg A8, A_x31x21 ; line 2:3 data 1
.asg A9, A_ptr_ln_y ; pntr to even outputs
.asg A10, A_start_ln ; start line into input buffer
.asg A10, A_x13x03 ; line 0:1 data 3
.asg A11, A_ptr_ln0_x ; line0 data 0 to 3
.asg A12, A_ptr_ln1_x ; line1 data 0 to 3
.asg A13, A_ptr_ln2_x ; line2 data 0 to 3
.asg A14, A_ptr_ln3_x ; line3 data 0 to 3
.asg A0, A_SP ; Stack pointer, A datapath
.asg A16, A_t_y10 ; packed outputs 1:0
.asg A16, A_x11x10 ; line 1 data 1:0
.asg A17, A_p03 ; temp sum of products
.asg A17, A_t_y32 ; packed outputs 3:2
.asg A17, A_x10x00 ; line 0:1 data 0
.asg A17, A_x12x02 ; line 0:1 data 2
.asg A17, A_x13x12 ; line 1 data 3:2
.asg A18, A_x21x20 ; line 2 data 1:0
.asg A18, A_x30x20 ; line 2:3 data 0
.asg A19, A_x23x22 ; line 2 data 3:2
.asg A20, A_p21 ; temp sum of products
.asg A20, A_x31x30 ; line 3 data 1:0
.asg A21, A_x33x32 ; line 3 data 3:2
.asg A22, A_h1h0 ; filter taps i:i+1
.asg A23, A_h3h2 ; filter taps i+2:i+3
.asg A24, A_x01x00 ; line 0 data 1:0
.asg A25, A_x03x02 ; line 0 data 3:2
.asg A26, A_y0 ; acumulater 0
.asg A27, A_y1 ; acumulater 1
.asg A28, A_y2 ; acumulater 2
.asg A29, A_y3 ; acumulater 3
.asg A30, A_hh ; base address of rotated filter
.asg A31, A_l_hh ; copy of length of filter
.asg B0, B_j ; count in filter rotate loop
.asg B0, B_p07 ; temp sum of products
.asg B0, B_p207 ; temp sum of products
.asg B0, B_start_ln ; copy of start_line
.asg B0, B_x14x04 ; line 0:1 data 4
.asg B1, B_no_gie ; CSR w/ GIE bit cleared
.asg B1, B_p24 ; temp sum of products
.asg B1, B_p27 ; temp sum of products
.asg B2, B_csr ; CSR's value
.asg B2, B_k ; count to rotate filter
.asg B2, B_pro ; prologue predication
.asg B3, B_ret ; Return address
.asg B3, B_x36x26 ; line 2:3 data 6
.asg B4, B_p06 ; temp sum of products
.asg B4, B_p206 ; temp sum of products
.asg B4, B_ptr_plane_y ; pntr to output line
.asg B4, B_x34x24 ; line 2:3 data 4
.asg B5, B_filter ; copy of pointto filter
.asg B5, B_p05 ; temp sum of products
.asg B5, B_p205 ; temp sum of products
.asg B5, B_p26 ; temp sum of products
.asg B6, B_ptr_hh ; pntr to filter
.asg B6, B_x17x07 ; line 0:1 data 7
.asg B7, B_i ; main scale_vert loop count % 32
.asg B8, B_l_hh ; length of filter in shorts
.asg B8, B_ptr_ln_y ; pntr to odd outputs
.asg B9, B_mod_hh ; copy of filter pntr
.asg B9, B_x15x05 ; line 0:1 data 5
.asg B10, B_x35x25 ; line 2:3 data 5
.asg B11, B_ka ; index along buffer
.asg B12, B_ptr_ln0_x ; line0 data 4 to 7
.asg B13, B_ptr_ln1_x ; line1 data 4 to 7
.asg B14, B_ptr_ln2_x ; line2 data 4 to 7
.asg B15, B_SP ; Stack pointer, B datapath
.asg B16, B_ptr_ln3_x ; line3 data 4 to 7
.asg B17, B_n_x ; copy of length of line
.asg B18, B_block ; amount of data in buffer - 8
.asg B19, B_zero ; constant of 0x00000000
.asg B20, B_p25 ; temp sum of products
.asg B20, B_x15x14 ; line 1 data 5:4
.asg B21, B_p04 ; temp sum of products
.asg B21, B_p204 ; temp sum of products
.asg B21, B_x17x16 ; line 1 data 7:6
.asg B22, B_t_y54 ; packed outputs 5:4
.asg B22, B_x35x34 ; line 3 data 5:4
.asg B22, B_x37x27 ; line 2:3 data 7
.asg B23, B_t_y76 ; packed outputs 7:6
.asg B23, B_x37x36 ; line 3 data 7:6
.asg B24, B_x05x04 ; line 0 data 5:4
.asg B25, B_hh_i ; values in rotated filter
.asg B25, B_x07x06 ; line 0 data 7:6
.asg B25, B_x16x06 ; line 0:1 data 6
.asg B26, B_x25x24 ; line 2 data 5:4
.asg B27, B_x27x26 ; line 2 data 7:6
.asg B28, B_y6 ; acumulater 6
.asg B29, B_y7 ; acumulater 7
.asg B30, B_y4 ; acumulater 4
.asg B31, B_y5 ; acumulater 5
*==============================================================================*
;-
B .S2 BLOCK ; block interupts
STW .D2T2 B14, *B_SP--[14] ; Save B14, get stk
|| MV B_SP, A_SP ; Twin Stack Ptr
STDW .D1T1 A15:A14, *-A_SP[1] ; Save A15, A14
|| STDW .D2T2 B13:B12, *+B_SP[5] ; Save B13, B12
|| MVC .S2 CSR, B_csr ; Remember CSR
STDW .D1T1 A13:A12, *-A_SP[3] ; Save A13, A12
|| STDW .D2T2 B11:B10, *+B_SP[3] ; Save B11, B10
|| AND B_csr, -2, B_no_gie ; Clear GIE
;-
STDW .D1T1 A11:A10, *-A_SP[5] ; Save A11, A10
|| STDW .D2T2 B_ret:B_csr,*+B_SP[1] ; Save Return, CSR
|| MVC .S2 B_no_gie, CSR ; Disable ints
|| MV .L1X B_l_hh, A_l_hh ;copy l_hh
|| MV .L2X A_start_ln, B_start_ln ;copy start line
ADDAH .D2 B_ptr_hh, B_l_hh, B_filter ;rotated filter start
|| MV .L2X A_n_x, B_n_x ;copy cols
||[!B_start_ln]MV.S2 B_l_hh, B_start_ln ;if(!start)start=l_hh
BLOCK:
SUBAH .D2 B_filter, B_start_ln, B_filter ;ptr_hh+l_hh-start
|| ADD .S2X A_ptr_plane_x, 8, B_ptr_ln0_x ;ptr_ln1_x=plane_x+1
|| MV .S1 A_ptr_plane_x, A_ptr_ln0_x ;ptr_ln0_x=plane_x
SUB .L2 B_l_hh, 1, B_k ;for(i=0;i