www.pudn.com > scaling.rar > pix_expand_h.asm


* ========================================================================= * 
*   TEXAS INSTRUMENTS, INC.                                                 * 
*                                                                           * 
*   NAME                                                                    * 
*       pix_expand                                                          * 
*                                                                           * 
*   USAGE                                                                   * 
*       This routine is C-callable and can be called as:                    * 
*                                                                           * 
*       void pix_expand_asm                                                 * 
*       (                                                                   * 
*           int n,                                    /* # of elements */   * 
*           const unsigned char *restrict in_data,    /* Input data    */   * 
*           short               *restrict out_data    /* Output data   */   * 
*       )                                                                   * 
*                                                                           * 
*                                                                           * 
*   DESCRIPTION                                                             * 
*       The code takes an array of bytes and promotes them to half-words    * 
*       by zero-extension.                                                  * 
*                                                                           * 
*       This is the C equivalent of the assembly code, without              * 
*       restrictions.  The assembly code has restrictions, as noted below.  * 
*                                                                           * 
*       void pix_expand                                                     * 
*       (                                                                   * 
*           int n,                                                          * 
*           const unsigned char *restrict in_data,                          * 
*           short               *restrict out_data                          * 
*       )                                                                   * 
*       {                                                                   * 
*           int i;                                                          * 
*                                                                           * 
*           for (i = 0; i < n; i++)                                         * 
*               out_data[i] =  in_data[i];                                  * 
*       }                                                                   * 
*                                                                           * 
*   ASSUMPTIONS                                                             * 
*       Input and output arrays must be double-word (8-byte) aligned.       * 
*                                                                           * 
*       The input must be at least 16 elements long and contain a           * 
*       multiple of 16 elements.                                            * 
*                                                                           * 
*   NOTE                                                                    * 
*       Interrupts are masked during the entire duration of this            * 
*       function, as the entire function occurs within branch delay slots.  * 
*                                                                           * 
*   MEMORY NOTE                                                             * 
*       No bank conflicts occur.  This is a LITTLE ENDIAN implementation.   * 
*                                                                           * 
*   TECHNIQUES                                                              * 
*       The loop is unrolled 16 times, loading bytes with LDDW.  It uses    * 
*       UNPKHU4 and UNPKLU4 to unpack the data and store the results with   * 
*       STDW.                                                               * 
*                                                                           * 
*       To shave a few extra cycles from the function, the return branch    * 
*       is issued from within the kernel.                                   * 
*                                                                           * 
*   CYCLES                                                                  * 
*       cycles = 3 * (n / 16) + 15.                                         * 
*       For n = 1072, cycles = 216.                                         * 
*                                                                           * 
*   CODESIZE                                                                * 
*       100 bytes.                                                          * 
*                                                                           * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2000 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= * 
        .include "pix_expand_h.h62" 
_pix_expand_asm: 
* ===================== SYMBOLIC REGISTER ASSIGNMENTS ===================== * 
        .asg            A0,          A_i 
        .asg            B1,          B_i  
        .asg            A2,          A_p_10 
        .asg            A3,          A_p_32 
        .asg            A4,          A_n 
        .asg            A6,          A_o0 
        .asg            A7,          A_i1 
        .asg            A8,          A_p_3210 
        .asg            A8,          A_p_98 
        .asg            A9,          A_p_7654 
        .asg            A9,          A_p_BA 
        .asg            B0,          B_p 
        .asg            B3,          B_ret 
        .asg            B4,          B_i0 
        .asg            B5,          B_o1 
        .asg            B6,          B_p_BA98 
        .asg            B6,          B_p_DC 
        .asg            B7,          B_p_FE 
        .asg            B7,          B_p_FEDC 
        .asg            B8,          B_p_54 
        .asg            B9,          B_p_76 
* ========================================================================= * 
* =========================== PIPE LOOP PROLOG ============================ * 
        B               loop                                    ;[ 4,1] 
 
        ADD             B_i0,       8,          A_i1 
||      ADD             A_o0,       8,          B_o1 
||      SHR             A_n,        4,          A_i 
 
        LDDW    .D1T2   *A_i1++[2], B_p_FEDC:B_p_BA98           ;[ 1,1] 
||      LDDW    .D2T1   *B_i0++[2], A_p_7654:A_p_3210           ;[ 1,1] 
||      MVK             0xFFFF8000, B_p 
* =========================== PIPE LOOP KERNEL ============================ * 
loop: 
  [!B_p]STDW    .D1T1   A_p_BA:A_p_98,          *A_o0[2]        ;[ 8,1] 
||[!B_p]STDW    .D2T2   B_p_FE:B_p_DC,          *B_o1[2]        ;[ 8,1] 
||[ A_i]BPOS    .S1     loop,       A_i                         ;[ 5,2] 
||[!A_i]B       .S2     B_ret 
||      SUB     .L1     A_i,        1,          A_i 
||      ADD     .L2X    1,          A_i,        B_i             ;final load 
||[ B_p]MPY     .M2     B_p,        2,          B_p 
 
  [!B_p]STDW    .D1T1   A_p_32:A_p_10,          *A_o0++[4]      ;[ 9,1] 
||[!B_p]STDW    .D2T2   B_p_76:B_p_54,          *B_o1++[4]      ;[ 9,1] 
||      UNPKLU4 .S1     A_p_3210,   A_p_10                      ;[ 6,2] 
||      UNPKHU4 .L1     A_p_3210,   A_p_32                      ;[ 6,2] 
||      UNPKLU4 .L2X    A_p_7654,   B_p_54                      ;[ 6,2] 
||      UNPKHU4 .S2X    A_p_7654,   B_p_76                      ;[ 6,2] 
 
        UNPKLU4 .S1X    B_p_BA98,   A_p_98                      ;[ 7,2] 
||      UNPKHU4 .L1X    B_p_BA98,   A_p_BA                      ;[ 7,2] 
||      UNPKLU4 .L2     B_p_FEDC,   B_p_DC                      ;[ 7,2] 
||      UNPKHU4 .S2     B_p_FEDC,   B_p_FE                      ;[ 7,2] 
||[ B_i]LDDW    .D1T2   *A_i1++[2], B_p_FEDC:B_p_BA98           ;[ 1,4] 
||[ B_i]LDDW    .D2T1   *B_i0++[2], A_p_7654:A_p_3210           ;[ 1,4] 
* ========================================================================= * 
* ========================================================================= * 
*   End of file:  pix_expand_h.asm                                          * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2000 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= *