www.pudn.com > scaling.rar > scale_horz_h.asm


* ========================================================================= * 
*                                                                           * 
*   TEXAS INSTRUMENTS, INC.                                                 * 
*                                                                           * 
*   NAME                                                                    * 
*       scale_horz                                                          * 
*                                                                           * 
*                                                                           * 
*   USAGE                                                                   * 
*       This routine is C-callable and can be called as:                    * 
*                                                                           * 
*           void scale_horz                                                 * 
*           (                                                               * 
*               unsigned short *in_data,  /* Ptr to unscaled lines      */  * 
*               unsigned int    in_len,   /* Pixels/line unscaled       */  * 
*               short          *out_data, /* Ptr to scaled data lines   */  * 
*               unsigned int    out_len,  /* Pixels/line of scaled data */  * 
*               short          *hh,       /* Ptr to filter taps,            * 
*                                            interleaved odd/even           * 
*                                            outputs                    */  * 
*               unsigned int    l_hh,     /* Length of scaling filters  */  * 
*               unsigned int    n_hh,     /* Number of scaling filters  */  * 
*               short          *patch     /* Ptr to decrement pattern   */  * 
*           );                                                              * 
*                                                                           * 
*   DESCRIPTION                                                             * 
*                                                                           * 
*       This code can scale up or down 1 line of data, in the               * 
*       ratio out_len : in_len.  e.g 1 to 3, 4:3, 5:6. The                  * 
*       filters are designed outside of the loop using a                    * 
*       general purpose resizing algorithm.                                 * 
*                                                                           * 
*           patch0 = patch + 2;                                             * 
*           filter_count = n_hh;                                            * 
*           ka = 0;                                                         * 
*                                                                           * 
*           line0_x = plane_x;                                              * 
*           line0_y = plane_y;                                              * 
*           ptr_hh = hh;                                                    * 
*           jump = (int) patch[0]; ka = jump >> 1;                          * 
*           jump = (int) patch[1]; kb = jump >> 1;                          * 
*                                                                           * 
*           for ( i = 0; i < n_y; i += 2)                                   * 
*           {                                                               * 
*               y0 = 1 << 5;                                                * 
*               y1 = 1 << 5;                                                * 
*               for ( j = 0; j < l_hh; j+=4)                                * 
*               {                                                           * 
*                   /* even outputs */                                      * 
*                   for (k=0; k < 4; k++)                                   * 
*                   {                                                       * 
*                       h0 = *ptr_hh++;                                     * 
*                       x0 = *(line0_x+ ka + k);                            * 
*                       y0 += ( x0 * h0 );                                  * 
*                   }                                                       * 
*                   jump = (int) (*patch0++);                               * 
*                   ka = ka + (jump>>1);                                    * 
*                   /* odd outputs */                                       * 
*                   for (k=0; k < 4; k++)                                   * 
*                   {                                                       * 
*                       h1 = *ptr_hh++;                                     * 
*                       x1 = *(line0_x + kb + k);                           * 
*                       y1 += ( x1 * h1 );                                  * 
*                   }                                                       * 
*                   jump = (int) (*patch0++);                               * 
*                   kb = kb + (jump>>1);                                    * 
*               }                                                           * 
*               *line0_y++ = (short) (y0 >> 6) ;                            * 
*               *line0_y++ = (short) (y1 >> 6) ;                            * 
*                                                                           * 
*               filter_count -= 2;                                          * 
*               if (!filter_count)                                          * 
*               {                                                           * 
*                   patch0 = patch + 2;                                     * 
*                   ptr_hh = hh;                                            * 
*                   filter_count = n_hh;                                    * 
*               }                                                           * 
*           }                                                               * 
*                                                                           * 
*   ASSUMPTIONS                                                             * 
*       One line of data is produced per function call.                     * 
*                                                                           * 
*       The line must be aligned on a double word boundary and be a         * 
*       multiple of 8 bytes.                                                * 
*                                                                           * 
*       Filters are multiples of 4 taps, maximum number of filters is 256.  * 
*                                                                           * 
*       The computations for each output are interleaved, thus the filters  * 
*       are interleaved on a 4 short interval.                              * 
*                                                                           * 
*       Little ENDIAN Configuration is used and the input and output data   * 
*       is 16 bit unsigned and signed shorts respectively.  The filters     * 
*       are also 16 bit signed shorts in 12 bit precision.                  * 
*                                                                           * 
*       The n_hh filters are all of the same length and are                 * 
*       strung together in a single linear array.                           * 
*                                                                           * 
*       Interrupts are masked by the function for most of its duration.     * 
*                                                                           * 
*   MEMORY NOTE                                                             * 
*       Some bank hits will occur in this code for certain scale            * 
*       factors and filter lengths.                                         * 
*                                                                           * 
*       For 4 taps k = 0, for l_hh 8, k = 0.031, for l_hh = 16, k = 0.015.  * 
*       Different filter lengths can produce different numbers of bank      * 
*       conflicts.  Overall, these bank conflicts have nearly zero effect.  * 
*                                                                           * 
*       For l_hh=4: k=0, l_hh=8: k=1/32, l_hh=12: k=0, l_hh=16: k=1/64      * 
*       For l_hh % 8 == 0, k = 1/(4*l_hh) else k = 0                        * 
*                                                                           * 
*       'k' is the bank conflict between the store and the guidance table   * 
*       load.  Depending on the relative sizes of the filters and           * 
*       memory width, this bank conflict is between 0 and 3.1%              * 
*       overhead.                                                           * 
*                                                                           * 
*   TECHNIQUES                                                              * 
*       The outputs are computed using interleaved inputs. The patch table  * 
*       controls the access of 2 parallel pointers. For example an 8/33     * 
*       scale factor will have the following access pattern.                * 
*                                                                           * 
*                 11111111112222222222333333333344444444445555555555        * 
*       012345678901234567890123456789012345678901234567890123456789        * 
*                                                                           * 
*       0  e xxxxxxxx     <-start point of even output 0                    * 
*       1      o xxxxxxxx      <-start point of odd output 4                * 
*       2          e xxxxxxxx                                               * 
*       3              o xxxxxxxx                                           * 
*       4                  e xxxxxxxx                                       * 
*       5                      o xxxxxxxx                                   * 
*       6                          e xxxxxxxx                               * 
*       7                              o xxxxxxxx                           * 
*       0                                   e xxxxxxxx  <-next start        * 
*       1                                       o xxxxxxxx  <-next start    * 
*                                                                           * 
*                                                                           * 
*       From this diagram the even pointer jumps 4 then another 4 as the    * 
*       filters have 8 taps, it then jumps 4 to get to the next set of      * 
*       input data. The odd pointer does the same. These jumps are          * 
*       interleaved and so are the filter coefficients. The jumps are       * 
*       in multiples of bytes as non-scaled non-aligned double word         * 
*       accesses are used.  In this case the table will be:                 * 
*                                                                           * 
*           short patch[] = {0,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,10,10,8,8};    * 
*                                                                           * 
*       Notice the first 2 entries are the initial starting points for      * 
*       the two pointers. To remove a dependency in the code the last 2     * 
*       entries are copies of the 2nd two. This makes the table almost      * 
*       circular.                                                           * 
*                                                                           * 
*   NOTES                                                                   * 
*       Other scale factors can be achieved with the following              * 
*       example tables.                                                     * 
*                                                                           * 
*   Scale Factor Taps  Table short jump[] =                                 * 
*   --------------------------------------------------------------------    * 
*       5/6       4    {0, 1, 2, 2, 2, 3, 3, 2, 2, 2, 3, 3, 2, 2}           * 
*       4/3       8    {0, 4, 4, 4, -3, -2, 4, 4, -2, -3, 4, 4}             * 
*       3/4       12   {0,1,4,4,4,4,-6,-5,4,4,4,4,-5,-6,4,4,4,4,-5,-5,4,4}  * 
*       6/5       16   {0,0,4,4,4,4,4,4,-11,-10,4,4,4,4,4,4,-10,-10,        * 
*                       4,4,4,4,4,4,-10,-11,4,4}                            * 
*                                                                           * 
*       The software to produce these tables and the simple coefficients    * 
*       for an arbitrary scale factor and number of taps can be found       * 
*       in the api document. Note in the case of 3/4, odd scale factors     * 
*       are doubled to make 6/8 instead of 3/4                              * 
*                                                                           * 
*   CYCLES                                                                  * 
*       cycles = 0.5 * out_len * l_hh * (1+k) + 30.                         * 
*       If (l_hh % 8) == 0 then k = 1/(4*l_hh) else k = 0.                  * 
*                                                                           * 
*       For l_hh = 16, in_len = 1024, and out_len = 1366,  cycles = 11129.  * 
*       For l_hh = 8,  in_len = 640,  and out_len = 120,   cycles = 525.    * 
*                                                                           * 
*   CODESIZE                                                                * 
*       452 bytes                                                           * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2001 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= * 
                .sect ".data:copyright_h" 
_Copyright:     .string "Copyright (C) 2001 Texas Instruments Incorporated. " 
                .string "All Rights Reserved." 
                .include "scale_horz_h.h62" 
_scale_horz_asm:  
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== * 
        .asg     A4,         A_plane_x    ; 
        .asg     B4,         B_n_x        ; 
        .asg     A6,         A_plane_y    ; 
        .asg     B6,         B_n_y        ; 
        .asg     A8,         A_hh         ; 
        .asg     B8,         B_l_hh       ; 
        .asg     A10,        A_n_hh       ; 
        .asg     B10,        B_patch      ; 
        .asg     A12,        A_filt_state ; 
        .asg     B18,        B_filt_no    ; 
        .asg     B16,        B_hh0        ; 
        .asg     A23,        A_ptr_hh     ; 
        .asg     B24,        B_ptr_hh     ; 
        .asg     A9,         A_line0_x0   ; 
        .asg     B9,         B_line0_x1   ; 
        .asg     A18,        A_ka         ; 
        .asg     B17,        B_kb         ; 
        .asg     A17,        A_jump10     ; 
        .asg     A19,        A_kbka       ; 
        .asg     A22,        A_patch0     ; 
        .asg     A20,        A_y0         ; 
        .asg     B20,        B_y1         ; 
        .asg     B21,        B_line0_y    ; 
        .asg     A7,         A_round      ; 
        .asg     A0,         A_taps       ; 
        .asg     A5,         A_l_hh       ; 
        .asg     B0,         B_f_cnt      ; 
        .asg     A3,         A_patch      ; 
        .asg     B7,         B_hh         ; 
        .asg     B5,         B_max_filt   ; 
        .asg     B22,        B_i          ; 
        .asg     A27,        A_h03h02     ; 
        .asg     A26,        A_h01h00     ; 
        .asg     B27,        B_h13h12     ; 
        .asg     B26,        B_h11h10     ; 
        .asg     A17,        A_x03x02     ; 
        .asg     A16,        A_x01x00     ; 
        .asg     B17,        B_x13x12     ; 
        .asg     B16,        B_x11x10     ; 
        .asg     B19,        B_one        ; 
        .asg     A24,        A_p00        ; 
        .asg     A21,        A_p01        ; 
        .asg     A18,        A_p0         ; 
        .asg     B23,        B_p10        ; 
        .asg     B28,        B_p11        ; 
        .asg     B18,        B_p1         ; 
        .asg     A16,        A_t_y0       ; 
        .asg     B19,        B_t_y1       ; 
        .asg     B16,        B_t_y10      ; 
        .asg     A22,        A_k1k0       ; 
        .asg     A28,        A_kbka_      ; 
        .asg     B29,        B_csr        ; 
        .asg     B22,        B_csr_no_gie ; 
* =========================== PIPE LOOP PROLOG ============================ * 
        LDW   .D1T2 *A_filt_state[0],      B_f_cnt      ;[ 2,0]  
||      MVC     .S2 CSR,        B_csr                   ; 
 
        ZERO    .L1 A_p01                               ; 
||      AND     .S2 B_csr,      -2,        B_csr_no_gie ; 
 
        ZERO    .L2 B_p10                               ; 
||      MVC     .S2 B_csr_no_gie,          CSR          ; 
 
        MPY    .M2X B_l_hh,     A_n_hh,    B_max_filt   ;[5,0]max_flt=l_hh*n_hh 
 
        ZERO    .L2 B_p11                               ; 
||      ZERO    .L1 A_p00                               ; 
 
        MV     .L2X A_hh,       B_hh                    ;[ 7,0]  
||      SUB     .D2 B_max_filt, B_f_cnt, B_filt_no      ;[ 7,0]filt_cnt=max_flt 
||      MV     .D1X B_patch,    A_patch                 ;[ 7,0]patch0=patch 
 
        ADDAH   .D2 B_hh,       B_filt_no, B_hh0        ;[ 8,0]  
||      LDW   .D1T1 *A_patch[0],           A_kbka       ;[ 8,0]initial start 
||      SHRU    .S2 B_filt_no,  1,         B_filt_no    ;[ 8,0]  
 
        ADD     .L1 A_patch,    4,         A_patch0     ;[ 9,0]patch0=patch 
||      ZERO    .S1 A_h01h00                            ; 
 
        ADD    .D1X A_patch0,   B_filt_no, A_patch0     ;[10,0]  
||      ZERO    .L2 B_h13h12                            ; 
||      B       .S1 LOOPY + 20                          ; 
 
        LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[11,0] first offset 
||      B       .S1 LOOPY1 + 12                         ; 
 
        MPY     .M2 B_l_hh,     B_n_y,     B_i          ;[12,0]  
||      MV     .L2X A_plane_y,  B_line0_y               ;[12,0]line0_y=plane_y 
||      MVK     .S1 020h,       A_round                 ;[12,0]  
||      MVK     .D2 1,          B_one                   ;[12,0]  
||      MPY    .M1X 1,          B_l_hh,    A_l_hh       ;[12,0]  
||      B       .S2 LOOPY2 + 12                         ; 
 
        MPYLH  .M2X B_one,      A_kbka,    B_kb         ;[13,0]  
||      MPY     .M1 1,          A_kbka,    A_ka         ;[13,0]  
||      B       .S1 LOOPY3 + 20                         ; 
 
        SHRU    .S2 B_i,        3,         B_i          ;[14,0]  
||      MPY    .M2X 1,          A_round,   B_y1         ;[14,0] y1 = 1 << 5 
||      ADD     .L1 12,         A_l_hh,    A_taps       ;[14,0]taps=l_hh 
||      ROTL    .M1 A_plane_x,  0,         A_line0_x0   ;[15,0]line0_x=plane_x 
||      B       .S1 LOOPY                               ; 
 
        SUB     .S2 B_i,        3,         B_i          ;[15,0]2 + 1 
||      ADD     .D2 B_hh0,      8,         B_ptr_hh     ;[15,0]ptr_hh=hh+1 
||      MV     .L1X B_hh0,      A_ptr_hh                ;[15,0]ptr_hh = hh 
||      MV     .L2X A_plane_x,  B_line0_x1              ;[15,0]line1_x=plane_x 
||      MV      .S1 A_round,    A_y0                    ;[15,0]y0 = 1 << 5 
||      LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[1,1]load next offst 
* =========================== PIPE LOOP KERNEL ============================ * 
LOOPY: 
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,1]  
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,1]  
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,1]  
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,3]  
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,3]  
||      LDDW  .D1T1 *A_ptr_hh++[2], A_h03h02:A_h01h00   ;[ 2,5]h3:0=*ptr_hh++ 
||      LDDW  .D2T2 *B_ptr_hh++[2], B_h13h12:B_h11h10   ;[ 2,5]h3:0=*ptr_hh++ 
||      SUB     .L2 B_f_cnt,    8,         B_f_cnt      ;[ 2,5]fil_count-=8 
LOOPY1: 
  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,1]if(!samp)y0=y1=0 
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,1]if(!samp)y0=y1=0 
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,2]  
||[!B_f_cnt]MV .S1X B_hh,       A_ptr_hh                ;[3,5](!flt_c)p_hh=hh 
||      ADD2    .D1 A_jump10,   A_kbka,    A_kbka       ;[3,5]  
||[!B_f_cnt]ADD .L2 B_hh,       8,         B_ptr_hh     ;[3,5](!flt_c)p_hh=hh+4 
||[!B_f_cnt]ADD .L1 A_patch,    8,         A_patch0     ;[3,5](!flt_c)ptch0=ptch 
||      LDNDW .D2T2 *B_line0_x1(B_kb), B_x13x12:B_x11x10;[3,5]x3:0=*(line0_x+kb) 
LOOPY2: 
        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,1]  
||      ADD     .S1 A_p00,      A_p01,     A_p0         ;[16,2]  
||      DOTP2   .M2 B_h11h10,   B_x11x10,  B_p10        ;[ 8,4]  
||      BDEC    .S2 LOOPY,      B_i                     ;[16,2]  
||      LDNDW .D1T1 *A_line0_x0(A_ka), A_x03x02:A_x01x00;[4,5]x3:0=*(line0_x+ka) 
||      MPY     .M1 1,          A_kbka,    A_ka         ;[ 4,5]  
||      MVK     .D2 1,          B_one                   ;[ 4,5]  
LOOPY3: 
  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,1]taps=l_hh 
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,1]if(!samp) 
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,2] *line0_y++=t_y0 
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,2]  
||      DOTP2   .M1 A_h03h02,   A_x03x02,  A_p01        ;[9,4]  
||[!B_f_cnt]MV  .L2 B_max_filt, B_f_cnt                 ;[5,5]if(!flt_c) 
||      MPYLH  .M2X B_one,      A_kbka,    B_kb         ;[5,5]  flt_cnt=max_flt 
||      LDW   .D1T1 *A_patch0++[1],        A_jump10     ;[1,6]load next offst 
* =========================== PIPE LOOP EPILOG ============================ * 
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,3]  
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,3]  
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,3]  
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,5]  
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,5]  
||      LDW   .D1T1 *A_patch[0],A_k1k0                  ; 
 
  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,3]if(!samp)y0=y1=0; 
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,3]if(!samp)y0=y1=0; 
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,4]  
 
        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,3]  
||      ADD     .L1 A_p00,      A_p01,     A_p0         ;[16,4]  
||      DOTP2   .M2 B_h11h10,   B_x11x10,  B_p10        ;[ 8,6]  
||      B       .S1 PIPE_DOWN                           ; 
||      MVK     .S2 0,          B_i                     ; 
 
  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,3]taps = l_hh 
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,3]if(!sample) 
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,4]  *line0_y++=t_y0 
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,4]  
||      DOTP2   .M1 A_h03h02,   A_x03x02,  A_p01        ;[ 9,6]  
PIPE_DOWN: 
        SHR     .S2 B_y1,       6,         B_t_y1       ;[18,4]  
||      SHR     .S1 A_y0,       6,         A_t_y0       ;[18,4]  
||      SUB     .L1 A_taps,     4,         A_taps       ;[18,4]  
||      DOTP2   .M1 A_h01h00,   A_x01x00,  A_p00        ;[10,6]  
||      DOTP2   .M2 B_h13h12,   B_x13x12,  B_p11        ;[10,6]  
 
  [!A_taps]MPY  .M1 1,          A_round,   A_y0         ;[19,4]if(!samp)y0=y1=0 
||[!A_taps]MPY .M2X 1,          A_round,   B_y1         ;[19,4]if(!samp)y0=y1=0 
||      ADD     .S2 B_p10,      B_p11,     B_p1         ;[15,5]  
||      SUB2    .L1 A_kbka,     A_k1k0,    A_kbka_      ; 
 
        PACK2  .L2X B_t_y1,     A_t_y0,    B_t_y10      ;[20,4]  
||      ADD     .L1 A_p00,      A_p01,     A_p0         ;[16,5]  
||      BDEC    .S2 PIPE_DOWN,  B_i                     ; 
||      SHR2    .S1 A_kbka_,    1,         A_kbka_      ; 
 
  [!A_taps]MV   .L1 A_l_hh,     A_taps                  ;[21,4]taps=l_hh 
||      ADD     .S1 A_p0,       A_y0,      A_y0         ;[17,5]  
||      ADD     .S2 B_p1,       B_y1,      B_y1         ;[17,5]  
||[!A_taps]STW.D2T2 B_t_y10,    *B_line0_y++[1]         ;[21,4](!samp) 
                                                        ;       *line0_y++=t_y0 
* ========================================================================= * 
        BNOP    .S2 B3,         2                       ;return to call 
 
        STW   .D1T1 A_kbka_,    *A_filt_state[1]        ; 
 
        STW   .D1T2 B_f_cnt,    *A_filt_state[2]        ; 
 
        MVC     .S2 B_csr,      CSR                     ; 
        ;BRANCH OCCURS  
* ========================================================================= * 
*   End of file:  scale_horz_h.asm                                          * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2001 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= *