www.pudn.com > scaling.rar > scale_vert_h.asm


* ========================================================================= * 
*   TEXAS INSTRUMENTS, INC.                                                 * 
*                                                                           * 
*   NAME                                                                    * 
*       scale_vert                                                          * 
*                                                                           * 
*   USAGE                                                                   * 
*       This routine is C-callable and can be called as:                    * 
*                                                                           * 
*           void scale_vert                                                 * 
*           (                                                               * 
*               short *in_cols,     /* Input image (16-bit pixels)      */  * 
*               int    cols,        /* Width of the image.              */  * 
*               short *out_cols,    /* Output image (16-bit pixels)     */  * 
*               short *ptr_hh,      /* Pointer to filter taps.          */  * 
*               short *mod_hh,      /* Pointer to rotated filter taps.  */  * 
*               int    l_hh,        /* Taps per filter (multiple of 4)  */  * 
*               int    start_line   /* Starting position for filter.    */  * 
*           )                                                               * 
*                                                                           * 
*   DESCRIPTION                                                             * 
*       Performs a vertical scaling function on a block of data from a      * 
*       frame store.  It uses a scaling filter decided upon by the          * 
*       calling function.  The filter is rotated depending on the           * 
*       starting line of the circular buffer.  One line of length           * 
*       'cols' is produced per function call.  The same filter runs         * 
*       along the entire length of buffer and perfoms the filter            * 
*       vertically along each set of parallel data points.                  * 
*                                                                           * 
*       void scale_vert                                                     * 
*       (                                                                   * 
*           short *ptr_plane_x,                                             * 
*           short *ptr_plane_y,                                             * 
*           int   n_x,                                                      * 
*           short *ptr_hh,                                                  * 
*           short *mod_hh,                                                  * 
*           int   l_hh,                                                     * 
*           int   start_line                                                * 
*       )                                                                   * 
*       {                                                                   * 
*           int     taps_count, sample, i, j, ka;                           * 
*           int     max_filt, y0, y1, y2, y3, block;                        * 
*           int     t_y0, t_y1, t_y2, t_y3;                                 * 
*           short   h0, h1, x0, x1, x2, x3, x01, x11, x21, x31;             * 
*           short * ptr_line0_x, * ptr_line1_x;                             * 
*           short * ptr_line2_x, * ptr_line3_x;                             * 
*           short * ptr_line0_x1, * ptr_line1_x1;                           * 
*           short * ptr_line2_x1, * ptr_line3_x1;                           * 
*           short * ptr_line0_y,  * filter;                                 * 
*                                                                           * 
*           taps_count = l_hh - 2;                                          * 
*                                                                           * 
*           ptr_line0_x = ptr_plane_x;                                      * 
*           ptr_line0_y = ptr_plane_y;                                      * 
*                                                                           * 
*           y0 = 0x0; y1 = 0x0;                                             * 
*           y2 = 0x0; y3 = 0x0;                                             * 
*                                                                           * 
*           max_filt = n_x*l_hh;                                            * 
*                                                                           * 
*           /* if we know the pointer will need to wrap around, a positive  * 
*           correction is needed instead of a negative one */               * 
*                                                                           * 
*           block = n_x*l_hh;                                               * 
*           block = block - 4;      /* next column on */                    * 
*                                                                           * 
*           if (!start_line) start_line = l_hh;                             * 
*           filter = &ptr_hh[l_hh-start_line];                              * 
*           j = start_line;                                                 * 
*           for (i = 0; i < l_hh; i++)                                      * 
*           {                                                               * 
*               mod_hh[i] = *filter++;                                      * 
*               j--;                                                        * 
*               if (!j) filter= ptr_hh;                                     * 
*               if (!j) j = l_hh;                                           * 
*           }                                                               * 
*           filter = mod_hh;                                                * 
*           ka = 0;                                                         * 
*                                                                           * 
*           ptr_line1_x = ptr_line0_x+1;                                    * 
*           ptr_line2_x = ptr_line1_x+1;                                    * 
*           ptr_line3_x = ptr_line2_x+1;                                    * 
*                                                                           * 
*           ptr_line0_x1 = ptr_line0_x+n_x ;                                * 
*           ptr_line1_x1 = ptr_line0_x1+1 ;                                 * 
*           ptr_line2_x1 = ptr_line1_x1+1 ;                                 * 
*           ptr_line3_x1 = ptr_line2_x1+1 ;                                 * 
*                                                                           * 
*           for ( i = 0; i < max_filt; i+=8)                                * 
*           {                                                               * 
*               h0 = *filter++;                                             * 
*               h1 = *filter++;                                             * 
*                                                                           * 
*               x0 = ptr_line0_x[ka];                                       * 
*               x1 = ptr_line1_x[ka];                                       * 
*               x2 = ptr_line2_x[ka];                                       * 
*               x3 = ptr_line3_x[ka];                                       * 
*                                                                           * 
*               x01 = ptr_line0_x1[ka];                                     * 
*               x11 = ptr_line1_x1[ka];                                     * 
*               x21 = ptr_line2_x1[ka];                                     * 
*               x31 = ptr_line3_x1[ka];                                     * 
*               ka += 2*n_x;                                                * 
*                                                                           * 
*               sample = taps_count ;                                       * 
*                                                                           * 
*               if (!taps_count) filter = mod_hh;                           * 
*               if (!taps_count) ka = ka - block;                           * 
*               if (!taps_count) taps_count = l_hh ;                        * 
*                                                                           * 
*               /* 2 taps have been used, for 1 point */                    * 
*               taps_count -= 2;                                            * 
*                                                                           * 
*               y0 += (x0*h0 + x01*h1) ;                                    * 
*               y1 += (x1*h0 + x11*h1) ;                                    * 
*               y2 += (x2*h0 + x21*h1) ;                                    * 
*               y3 += (x3*h0 + x31*h1) ;                                    * 
*                                                                           * 
*               t_y0 = y0 >> 16; t_y1 = y1 >> 16;                           * 
*               t_y2 = y2 >> 16; t_y3 = y3 >> 16;                           * 
*                                                                           * 
*               if (!sample) y0 = 0x0; if (!sample) y1 = 0x0;               * 
*               if (!sample) y2 = 0x0; if (!sample) y3 = 0x0;               * 
*                                                                           * 
*               if (!sample) *ptr_line0_y++ = t_y0;                         * 
*               if (!sample) *ptr_line0_y++ = t_y1;                         * 
*               if (!sample) *ptr_line0_y++ = t_y2;                         * 
*               if (!sample) *ptr_line0_y++ = t_y3;                         * 
*           }                                                               * 
*       }                                                                   * 
*                                                                           * 
*   ASSUMPTIONS                                                             * 
*       'l_hh' must be divisible by 4.  Pad with zeros for non % 4.         * 
*                                                                           * 
*       All data must be double word aligned.                               * 
*                                                                           * 
*       Input and output, mod_hh, ptr_hh, cols must be multiple of 8.       * 
*                                                                           * 
*       Input filters, hh, must be 12 bit precision.                        * 
*                                                                           * 
*       The data is in LITTLE ENDIAN format.                                * 
*                                                                           * 
*       The code is uninterruptable during execution.  Interrupts are       * 
*       disabled at the beginning and reenabled at the end.                 * 
*                                                                           * 
*   TECHNIQUES                                                              * 
*       Inner and outer loops are collapsed into 1 loop.                    * 
*                                                                           * 
*       A load store live too long problem is dealt with using a MPY.       * 
*                                                                           * 
*       The filter loop is unrolled 4 times with l_hh % 4 == 0.             * 
*                                                                           * 
*       The output data stores are packed together into words and the       * 
*       outer loop is unrolled 8 times to allow 8 parallel filters to       * 
*       be convolved at once.                                               * 
*                                                                           * 
*   NOTES                                                                   * 
*       Before calling this function the input data needs to be in Q11      * 
*       form. (It is generally the output from the horizontal scaling       * 
*       function, scale_horz.)  The results from this function must be      * 
*       clipped before displaying as some pixels may go < 0 or > 255.       * 
*                                                                           * 
*   MEMORY NOTE                                                             * 
*       No bank hits occur in this code.                                    * 
*                                                                           * 
*   CYCLES                                                                  * 
*       Formula for execution time:                                         * 
*       cycles = (5 * l_hh * cols) / 16  + 6 * l_hh + 32                    * 
*       For cols = 800 and l_hh = 4, cycles = 1056.                         * 
*                                                                           * 
*   CODESIZE                                                                * 
*       612 bytes                                                           * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2000 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= * 
 
                .sect ".data:copyright_h" 
_Copyright:     .string "Copyright (C) 2000 Texas Instruments Incorporated. " 
                .string "All Rights Reserved." 
                .include "scale_vert_h.h62" 
*==============================================================================* 
_scale_vert_asm:  
*================= SYMBOLIC REGISTER ASSIGNMENTS: =============================* 
        .asg    A0,         A_filter      ; copy of filter pointer inc. 
        .asg    A1,         A_taps        ; number of taps used 
        .asg    A2,         A_taps_       ; copy of number of taps used 
        .asg    A3,         A_n_x_        ; copy of length of line 
        .asg    A4,         A_p01         ; temp sum of products 
        .asg    A4,         A_p20         ; temp sum of products 
        .asg    A4,         A_p202        ; temp sum of products 
        .asg    A4,         A_ptr_plane_x ; pntr to input data buffer 
        .asg    A4,         A_x11x01      ; line 0:1 data 1 
        .asg    A4,         A_x32x22      ; line 2:3 data 2 
        .asg    A5,         A_ka          ; index along buffer 
        .asg    A5,         A_p02         ; temp sum of products 
        .asg    A6,         A_n_x         ; width of line in shorts 
        .asg    A6,         A_p00         ; temp sum of products 
        .asg    A6,         A_p200        ; temp sum of products 
        .asg    A6,         A_p203        ; temp sum of products 
        .asg    A6,         A_x33x23      ; line 2:3 data 3 
        .asg    A7,         A_p201        ; temp sum of products 
        .asg    A7,         A_p23         ; temp sum of products 
        .asg    A8,         A_mod_hh      ; point to rotated filter 
        .asg    A8,         A_p22         ; temp sum of products 
        .asg    A8,         A_x31x21      ; line 2:3 data 1 
        .asg    A9,         A_ptr_ln_y    ; pntr to even outputs 
        .asg    A10,        A_start_ln    ; start line into input buffer   
        .asg    A10,        A_x13x03      ; line 0:1 data 3 
        .asg    A11,        A_ptr_ln0_x   ; line0 data 0 to 3 
        .asg    A12,        A_ptr_ln1_x   ; line1 data 0 to 3 
        .asg    A13,        A_ptr_ln2_x   ; line2 data 0 to 3 
        .asg    A14,        A_ptr_ln3_x   ; line3 data 0 to 3 
        .asg    A0,         A_SP          ; Stack pointer, A datapath 
        .asg    A16,        A_t_y10       ; packed outputs 1:0 
        .asg    A16,        A_x11x10      ; line 1 data 1:0 
        .asg    A17,        A_p03         ; temp sum of products 
        .asg    A17,        A_t_y32       ; packed outputs 3:2 
        .asg    A17,        A_x10x00      ; line 0:1 data 0 
        .asg    A17,        A_x12x02      ; line 0:1 data 2 
        .asg    A17,        A_x13x12      ; line 1 data 3:2 
        .asg    A18,        A_x21x20      ; line 2 data 1:0 
        .asg    A18,        A_x30x20      ; line 2:3 data 0 
        .asg    A19,        A_x23x22      ; line 2 data 3:2 
        .asg    A20,        A_p21         ; temp sum of products 
        .asg    A20,        A_x31x30      ; line 3 data 1:0 
        .asg    A21,        A_x33x32      ; line 3 data 3:2 
        .asg    A22,        A_h1h0        ; filter taps i:i+1 
        .asg    A23,        A_h3h2        ; filter taps i+2:i+3 
        .asg    A24,        A_x01x00      ; line 0 data 1:0 
        .asg    A25,        A_x03x02      ; line 0 data 3:2 
        .asg    A26,        A_y0          ; acumulater 0 
        .asg    A27,        A_y1          ; acumulater 1 
        .asg    A28,        A_y2          ; acumulater 2 
        .asg    A29,        A_y3          ; acumulater 3 
        .asg    A30,        A_hh          ; base address of rotated filter 
        .asg    A31,        A_l_hh        ; copy of length of filter 
        .asg    B0,         B_j           ; count in filter rotate loop 
        .asg    B0,         B_p07         ; temp sum of products 
        .asg    B0,         B_p207        ; temp sum of products 
        .asg    B0,         B_start_ln    ; copy of start_line 
        .asg    B0,         B_x14x04      ; line 0:1 data 4 
        .asg    B1,         B_no_gie      ; CSR w/ GIE bit cleared 
        .asg    B1,         B_p24         ; temp sum of products 
        .asg    B1,         B_p27         ; temp sum of products 
        .asg    B2,         B_csr         ; CSR's value 
        .asg    B2,         B_k           ; count to rotate filter 
        .asg    B2,         B_pro         ; prologue predication 
        .asg    B3,         B_ret         ; Return address 
        .asg    B3,         B_x36x26      ; line 2:3 data 6 
        .asg    B4,         B_p06         ; temp sum of products 
        .asg    B4,         B_p206        ; temp sum of products 
        .asg    B4,         B_ptr_plane_y ; pntr to output line 
        .asg    B4,         B_x34x24      ; line 2:3 data 4 
        .asg    B5,         B_filter      ; copy of pointto filter 
        .asg    B5,         B_p05         ; temp sum of products 
        .asg    B5,         B_p205        ; temp sum of products 
        .asg    B5,         B_p26         ; temp sum of products 
        .asg    B6,         B_ptr_hh      ; pntr to filter 
        .asg    B6,         B_x17x07      ; line 0:1 data 7 
        .asg    B7,         B_i           ; main scale_vert loop count % 32 
        .asg    B8,         B_l_hh        ; length of filter in shorts 
        .asg    B8,         B_ptr_ln_y    ; pntr to odd outputs 
        .asg    B9,         B_mod_hh      ; copy of filter pntr 
        .asg    B9,         B_x15x05      ; line 0:1 data 5 
        .asg    B10,        B_x35x25      ; line 2:3 data 5 
        .asg    B11,        B_ka          ; index along buffer 
        .asg    B12,        B_ptr_ln0_x   ; line0 data 4 to 7 
        .asg    B13,        B_ptr_ln1_x   ; line1 data 4 to 7 
        .asg    B14,        B_ptr_ln2_x   ; line2 data 4 to 7 
        .asg    B15,        B_SP          ; Stack pointer, B datapath 
        .asg    B16,        B_ptr_ln3_x   ; line3 data 4 to 7 
        .asg    B17,        B_n_x         ; copy of length of line 
        .asg    B18,        B_block       ; amount of data in buffer - 8 
        .asg    B19,        B_zero        ; constant of 0x00000000 
        .asg    B20,        B_p25         ; temp sum of products 
        .asg    B20,        B_x15x14      ; line 1 data 5:4 
        .asg    B21,        B_p04         ; temp sum of products 
        .asg    B21,        B_p204        ; temp sum of products 
        .asg    B21,        B_x17x16      ; line 1 data 7:6 
        .asg    B22,        B_t_y54       ; packed outputs 5:4 
        .asg    B22,        B_x35x34      ; line 3 data 5:4 
        .asg    B22,        B_x37x27      ; line 2:3 data 7 
        .asg    B23,        B_t_y76       ; packed outputs 7:6 
        .asg    B23,        B_x37x36      ; line 3 data 7:6 
        .asg    B24,        B_x05x04      ; line 0 data 5:4 
        .asg    B25,        B_hh_i        ; values in rotated filter 
        .asg    B25,        B_x07x06      ; line 0 data 7:6 
        .asg    B25,        B_x16x06      ; line 0:1 data 6 
        .asg    B26,        B_x25x24      ; line 2 data 5:4 
        .asg    B27,        B_x27x26      ; line 2 data 7:6 
        .asg    B28,        B_y6          ; acumulater 6 
        .asg    B29,        B_y7          ; acumulater 7 
        .asg    B30,        B_y4          ; acumulater 4 
        .asg    B31,        B_y5          ; acumulater 5 
*==============================================================================* 
;-             
        B       .S2   BLOCK                               ; block interupts 
 
        STW     .D2T2 B14,        *B_SP--[14]             ; Save B14, get stk 
||      MV            B_SP,       A_SP                    ; Twin Stack Ptr 
 
        STDW    .D1T1 A15:A14,    *-A_SP[1]               ; Save A15, A14 
||      STDW    .D2T2 B13:B12,    *+B_SP[5]               ; Save B13, B12 
||      MVC     .S2   CSR,        B_csr                   ; Remember CSR 
 
        STDW    .D1T1 A13:A12,    *-A_SP[3]               ; Save A13, A12 
||      STDW    .D2T2 B11:B10,    *+B_SP[3]               ; Save B11, B10 
||      AND           B_csr,      -2,         B_no_gie    ; Clear GIE 
;- 
        STDW    .D1T1 A11:A10,    *-A_SP[5]               ; Save A11, A10 
||      STDW    .D2T2 B_ret:B_csr,*+B_SP[1]               ; Save Return, CSR 
||      MVC     .S2   B_no_gie,   CSR                     ; Disable ints 
||      MV      .L1X  B_l_hh,     A_l_hh                  ;copy l_hh 
||      MV      .L2X  A_start_ln, B_start_ln              ;copy start line 
 
        ADDAH   .D2   B_ptr_hh,   B_l_hh,     B_filter    ;rotated filter start  
||      MV      .L2X  A_n_x,      B_n_x                   ;copy cols  
||[!B_start_ln]MV.S2  B_l_hh,     B_start_ln              ;if(!start)start=l_hh 
BLOCK: 
        SUBAH   .D2   B_filter,   B_start_ln, B_filter    ;ptr_hh+l_hh-start 
||      ADD     .S2X  A_ptr_plane_x, 8,       B_ptr_ln0_x ;ptr_ln1_x=plane_x+1 
||      MV      .S1   A_ptr_plane_x,          A_ptr_ln0_x ;ptr_ln0_x=plane_x 
 
        SUB     .L2   B_l_hh,     1,          B_k         ;for(i=0;i