; www.pudn.com > ccs_encoder.rar > fdct.asm


            ; .sect ".data:copyright_h" 
; 
        .text 
        .global _IMG_fdct_8x8 
; ------------------------------------------------------------------------- ;
;   _IMG_fdct_8x8: exported entry point.  As used by the code below, A4
;   holds the data pointer (A_i_ptr), B4 holds num_fdcts and B3 holds the
;   return address.  The vertical loop derives its output pointer from
;   A_i_ptr, so results are written back over the input data.
; ------------------------------------------------------------------------- ;
_IMG_fdct_8x8: 
; ========================== SYMBOLIC CONSTANTS =========================== ; 
;   16-bit cosine terms.  MVKL/MVKLH place them in register halves so they
;   can be used as MPY operands.
;   NOTE(review): numerically cst_c0 ~= cos(pi/4) * 2^16 and cst_c1..cst_c7
;   ~= sqrt(2) * cos(k*pi/16) * 2^13 -- confirm against the algorithm
;   documentation before changing any value.
        .asg            0xB505,     cst_c0  ; Cosine term c0 
        .asg            0x2C62,     cst_c1  ; Cosine term c1 
        .asg            0x29CF,     cst_c2  ; Cosine term c2 
        .asg            0x25A0,     cst_c3  ; Cosine term c3 
        .asg            0x1924,     cst_c5  ; Cosine term c5 
        .asg            0x1151,     cst_c6  ; Cosine term c6 
        .asg            0x08D4,     cst_c7  ; Cosine term c7 
 
; =============== SYMBOLIC REGISTER ASSIGNMENTS: VERT LOOP ================ ; 
;   Physical registers are reused: several symbols below name the same
;   register (for example A6 is A_f7, A_g0, A_p0 and A_c2r0, and B15 is
;   B_f6, B_g3, B_s0a, B_s0 and B_r1), so the instruction schedule must
;   keep the lifetimes of such values from overlapping.  B15 is also the
;   stack pointer on entry; the vertical-loop prolog copies it to IRP
;   before B15 is first loaded with data.
        .asg            A11,        A_k1c0  ; 1, Cosine term c0     (packed) 
        .asg            A12,        A_c1c7  ; Cosine terms c1, c7   (packed) 
        .asg            A13,        A_c2c6  ; Cosine terms c2, c6   (packed) 
        .asg            B11,        B_k1c0  ; 1, Cosine term c0     (packed) 
        .asg            B12,        B_c1c7  ; Cosine terms c1, c7   (packed) 
        .asg            B13,        B_c2c6  ; Cosine terms c2, c6   (packed) 
        .asg            B14,        B_c3c5  ; Cosine terms c3, c5   (packed) 
        .asg            A4,         A_i_ptr ; Input pointer 
        .asg            B10,        B_o_ptr ; Output pointer 
        .asg            A9,         A_f0    ; Spatial domain sample f0 
        .asg            B8,         B_f1    ; Spatial domain sample f1 
        .asg            B6,         B_f2    ; Spatial domain sample f2 
        .asg            A5,         A_f3    ; Spatial domain sample f3 
        .asg            A7,         A_f4    ; Spatial domain sample f4 
        .asg            B7,         B_f5    ; Spatial domain sample f5 
        .asg            B15,        B_f6    ; Spatial domain sample f6 
        .asg            A6,         A_f7    ; Spatial domain sample f7 
        .asg            A6,         A_g0    ; Node g0 in flow graph 
        .asg            B8,         B_g1    ; Node g1 in flow graph 
        .asg            B6,         B_h1    ; Node h1 in flow graph 
        .asg            A7,         A_h0    ; Node h0 in flow graph 
        .asg            A0,         A_s1    ; Node s1 (h2) in flow graph 
        .asg            B4,         B_h3    ; Node h3 in flow graph 
        .asg            B15,        B_g3    ; Node g3 in flow graph 
        .asg            A15,        A_q1    ; Node q1 (g2) in flow graph 
        .asg            A6,         A_p0    ; Node p0 in flow graph 
        .asg            B6,         B_p1    ; Node p1 in flow graph 
        .asg            B15,        B_s0a   ; Node s0 intermediate result 
        .asg            B5,         B_s0b   ; Node s0 intermediate result 
        .asg            B15,        B_s0    ; Node s0 in flow graph 
        .asg            A3,         A_r0    ; Node r0 in flow graph 
        .asg            B15,        B_r1    ; Node r1 in flow graph 
        .asg            B4,         B_q0a   ; Node q0 intermediate result 
        .asg            A14,        A_q0b   ; Node q0 intermediate result 
        .asg            A3,         A_q0    ; Node q0 in flow graph 
        .asg            A10,        A_Q1    ; Node Q1 in flow graph 
        .asg            B5,         B_S1    ; Node S1 in flow graph 
        .asg            A3,         A_Q0    ; Node Q0 in flow graph 
        .asg            B4,         B_S0    ; Node S0 in flow graph 
        .asg            A14,        A_c1Q1  ; Intermediate value c1 * Q1 
        .asg            A6,         A_c2r0  ; Intermediate value c2 * r0 
        .asg            A7,         A_c3Q0  ; Intermediate value c3 * Q0 
        .asg            A3,         A_c5Q0  ; Intermediate value c5 * Q0 
        .asg            A14,        A_c6r0  ; Intermediate value c6 * r0 
        .asg            A8,         A_c7Q1  ; Intermediate value c7 * Q1 
        .asg            B5,         B_c1S1  ; Intermediate value c1 * S1 
        .asg            B0,         B_c2r1  ; Intermediate value c2 * r1 
        .asg            B0,         B_c3S0  ; Intermediate value c3 * S0 
        .asg            B3,         B_c5S0  ; Intermediate value c5 * S0 
        .asg            B6,         B_c6r1  ; Intermediate value c6 * r1 
        .asg            B5,         B_c7S1  ; Intermediate value c7 * S1 
        .asg            B9,         B_F0    ; Frequency domain term F0 
        .asg            A8,         A_F1    ; Frequency domain term F1 
        .asg            A5,         A_F2    ; Frequency domain term F2 
        .asg            B4,         B_F3    ; Frequency domain term F3 
        .asg            B3,         B_F4    ; Frequency domain term F4 
        .asg            A9,         A_F5    ; Frequency domain term F5 
        .asg            A10,        A_F6    ; Frequency domain term F6 
        .asg            B4,         B_F7    ; Frequency domain term F7 
        .asg            A8,         A_F1t   ; Truncated result for F1 
        .asg            A5,         A_F2t   ; Truncated result for F2 
        .asg            B7,         B_F3t   ; Truncated result for F3 
        .asg            A10,        A_F5t   ; Truncated result for F5 
        .asg            A10,        A_F6t   ; Truncated result for F6 
        .asg            B5,         B_F7t   ; Truncated result for F7 
        .asg            B2,         B_i     ; Inner loop counter #2 (gates the A_i_ptr bump) 
        .asg            A1,         A_i     ; Inner loop counter #1 (gates the B_o_ptr bump) 
        .asg            B1,         B_o     ; Outer loop counter 
        .asg            A2,         A_c     ; Prolog collapse counter 
; ========================================================================= ; 
 
 
* ========================================================================= * 
*   Initialization code / Stack Management                                  * 
*                                                                           * 
*   This code is responsible for saving registers to the stack, disabling   * 
*   interrupts, and setting up for the vertical loop.                       * 
*                                                                           * 
*   This function requires 16 words of stack.  A10...A15, B10...B14, B3,    * 
*   CSR, IRP, and the spilled loop trip count (derived from 'num_fdcts')    * 
*   are all pushed on the stack.  For speed, this code uses twin stack      * 
*   pointers to offload registers onto the stack as quickly as possible.    * 
*                                                                           * 
*   The majority of the code in this function is not interruptible.         * 
*   Therefore, interrupts are disabled almost immediately after entry       * 
*   into the function, and the previous interruptibility state is restored  * 
*   on exit.  The previous value of CSR is pushed on the stack and          * 
*   restored on exit.                                                       * 
*                                                                           * 
*   Since all 32 registers are used by the vertical loop, the stack         * 
*   pointer is saved in the IRP register.  The previous contents of IRP     * 
*   are also pushed on the stack.                                           * 
*                                                                           * 
*   Initialization for constants (cosine terms, etc.) is overlapped with    * 
*   the prolog of the vertical loop to save time.  Pointer setup for the    * 
*   output pointer is also hidden in the prolog.                            * 
*                                                                           * 
*   Early exit code suppresses most of the function's activity (including   * 
*   most of the stack accesses) if num_fdcts (in B4) is zero.  It is not    * 
*   possible to exit the function faster.                                   * 
* ========================================================================= * 
 
;- 
; Stack frame built by this function, in words relative to the new B15.
; B15 is lowered by 16 words in the first packet while A15 keeps the
; caller's B15, so "*-A15[n]" addresses the same word as "*+B15[16-n]".
;     [16] A15   [15] A14   [14] B14   [13] A13   [12] A11   [11] A12
;     [10] B12   [ 9] B13   [ 8] B11   [ 7] A10   [ 6] B10   [ 5] B3
;     [ 4] IRP   [ 3] spilled loop count          [ 2] CSR
; The stores for slots 15, 3 and 2 are issued later, from the vertical
; loop's prolog.
;
; First packet: save A15 at the caller's SP, open the frame, twin the SP
; into A15, and compute the trip count B_o = num_fdcts * 8.
        STW     .D2T1   A15,        * B15--[16]     ; Save A15, get stk frame 
||      MV      .L1X    B15,        A15             ; Twin Stack Pointer 
||      SHL     .S2     B4,         3,          B_o ; iters == num_fdcts * 8 

; If B_o is zero, return through B3 and undo the frame (B15 is raised by
; 16 words again and A15 reloaded).  Every other instruction from here to
; the point where the branch occurs is predicated on B_o, so none of them
; executes on the abort path.
  [ B_o]STW     .D1T2   B14,        *-A15  [ 2]     ; Save B14 (SP[14]) 
||[ B_o]ADD     .L2     B_o,        -1,         B_o ; Adj. for parallel iters 
||[ B_o]ADDK    .S1     48,         A_i_ptr         ; Point to row 3, col 0 
||[!B_o]RET     .S2     B3                          ; Abort if num_fdcts == 0 
||[!B_o]LDW     .D2T1   *++B15[16], A15             ; Restore A15 on abort 
; ===== Interrupts masked by branch delay slots ===== 
;- 
; The next five packets occupy the delay slots of the RET above.
  [ B_o]STW     .D1T1   A13,        *-A15  [ 3]     ; Save A13 (SP[13]) 
||[ B_o]STW     .D2T2   B11,        *+B15  [ 8]     ; Save B11 
||[ B_o]MVC     .S2     CSR,        B0              ; Snapshot CSR 

  [ B_o]STW     .D1T1   A12,        *-A15  [ 5]     ; Save A12 (SP[11]) 
||[ B_o]STW     .D2T2   B12,        *+B15  [10]     ; Save B12 

  [ B_o]STW     .D1T2   B13,        *-A15  [ 7]     ; Save B13 (SP[ 9]) 
||[ B_o]STW     .D2T1   A11,        *+B15  [12]     ; Save A11 
||[ B_o]MVC     .S2     IRP,        B5              ; Snapshot IRP 
;- 
; B2 = CSR snapshot with bit 0 cleared; A1 = IRP snapshot, moved to the A
; side so it can be stored through A15.
  [ B_o]STW     .D1T1   A10,        *-A15  [ 9]     ; Save A10 (SP[ 7]) 
||[ B_o]STW     .D2T2   B10,        *+B15  [ 6]     ; Save B10 
||[ B_o]AND     .L2     B0,         -2,         B2  ; Clear GIE bit in CSR 
||[ B_o]MV      .L1X    B5,         A1              ; Partitioning MV 

; A0 keeps the unmodified CSR snapshot; it is stored to the frame from the
; vertical loop's prolog.
  [ B_o]STW     .D2T2   B3,         *+B15  [ 5]     ; Save return address 
||[ B_o]STW     .D1T1   A1,         *-A15  [12]     ; Save IRP (SP[ 4]) 
||[ B_o]MV      .L1X    B0,         A0              ; Partitioning MV 
||[ B_o]MVC     .S2     B2,         CSR             ; Mask interrupts 
; ===== Branch Occurs ===== 
;- 
; =========================== PIPE LOOP PROLOG ============================ ; 
;   Issues the eight sample loads of the first vertical pass and, in the
;   spare slots, builds the packed cosine constants, the loop counters and
;   the output pointer, and finishes the register save.
;
;   Load addressing: sample f_k is read (k - 3) * 8 halfwords away from
;   f3.  A_i_ptr is post-incremented by one halfword at the f3 load, so
;   the loads issued after it use offsets one halfword smaller (-25, +15,
;   -9, +23, +31 instead of -24, +16, -8, +24, +32).
        LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,1] 
||      MVC     .S2     B15,        IRP             ; Save Stack Pointer 

        LDH     .D1T2   *-A_i_ptr  [16],        B_f1            ;[ 2,1] 
||      MVK     .S1     4,          A_i             ; Inner loop counter #1 

        LDH     .D1T1   * A_i_ptr++[ 1],        A_f3            ;[ 3,1] 
||      MVKL    .S1     cst_c7,     A_c1c7          ; Cosine term C7 

        LDH     .D1T1   *-A_i_ptr  [25],        A_f0            ;[ 4,1] 
||      MVKL    .S1     cst_c0,     A_k1c0          ; Cosine term C0 
;- 
        LDH     .D1T2   *+A_i_ptr  [15],        B_f5            ;[ 5,1] 
||      MVKL    .S1     cst_c6,     A_c2c6          ; Cosine term C6 
||      MVKL    .S2     cst_c6,     B_c2c6          ; Cosine term C6 
||      MV      .L2X    A_c1c7,     B_c1c7          ; Twin constant register 

; B_o_ptr = A_i_ptr - 2 bytes.  A_i_ptr has already been post-incremented
; by one halfword, so B_o_ptr is the address the f3 load used: the output
; column is written over the input column.
        LDH     .D1T2   *-A_i_ptr  [ 9],        B_f2            ;[ 6,1] 
||      MVKLH   .S1     cst_c2,     A_c2c6          ; Cosine term C2 
||      SUB     .L1     A_i,        2,          A_c ; Prolog collapse cnt = 2 
||      ADD     .L2X    A_i_ptr,    -2,         B_o_ptr 
;- 
        LDH     .D1T2   *+A_i_ptr  [23],        B_f6            ;[ 7,1] 
||      MVKLH   .S2     cst_c1,     B_c1c7          ; Cosine term C1 

        LDH     .D1T1   *+A_i_ptr  [31],        A_f7            ;[ 8,1] 
||      MVKLH   .S1     cst_c1,     A_c1c7          ; Cosine term C1 
||      MVKLH   .S2     cst_c2,     B_c2c6          ; Cosine term C2 

; A14 goes to frame slot 15 here, before the kernel starts using it.
        MVKL    .S2     cst_c5,     B_c3c5          ; Cosine term C5 
||      MVKLH   .S1     1,          A_k1c0          ; Constant: 0x0001 
||      STW     .D2T1   A14,        *+B15  [15]     ; Save A14 
;- 
; Last two frame stores: the loop count (slot 3) and the CSR snapshot held
; in A0 (slot 2).  A_q1 is A15, so the SUB in this packet overwrites the
; twin stack pointer; the STW through A15 in the same packet still uses
; the old value as its base address.
        SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,1] q1=g2 
||      ADD     .S1     A_f3,       A_f4,       A_h0            ;[10,1] 
||      MVKLH   .S2     cst_c3,     B_c3c5          ; Cosine term C3 
||      STW     .D2T2   B_o,        *+B15  [ 3]     ; Spill horiz loop count 
||      STW     .D1T1   A0,         *-A15  [14]     ; Save CSR (SP[ 2]) 

; First load of the next pass.  B_i starts at 16, whereas A_i starts at 4.
        LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,2] 
||      MV      .L2X    A_k1c0,     B_k1c0          ; Twin constant register 
||      MVK     .S2     16,         B_i             ; Inner loop counter #2 
;- 
; =========================== PIPE LOOP KERNEL ============================ ; 
;   Ten execute packets per pass (v_loop .. v_loop_9).
;   NOTE(review): the [c,i] tag on each instruction looks like "cycle c of
;   overlapped iteration i" -- confirm against the scheduling notes.
;
;   Loop control
;     * B_o is the pass counter.  The branch back to v_loop is issued in
;       v_loop_4 and the five packets after it (v_loop_5 .. v_loop_9) fill
;       its delay slots.  B_o is decremented in v_loop_6 while nonzero.
;     * A_c (prolog collapse) starts at 2 and is decremented in v_loop
;       while nonzero.  Every STH is predicated on !A_c, so stores stay
;       disabled for the whole first pass and are enabled from v_loop_1 of
;       the second pass onward.
;
;   Output addressing (halfword offsets from B_o_ptr)
;       F4 +8, F0 -24, F6 +24, F2 -8, F3 0 (post-increment by 1), then
;       F5 +15, F1 -17, F7 +31, which are +16, -16, +32 relative to the
;       address F3 was stored at.
;
;   Block-to-block pointer bump
;     * B_i (input side) and A_i (output side) are multiplied by 4 once
;       per pass.  When one reads zero it is reloaded with 4 and 112 bytes
;       are added to A_i_ptr (v_loop_8) or B_o_ptr (v_loop_9).
;       NOTE(review): with a 16x16 MPY this should fire once every eight
;       passes, and 8 * 2 + 112 = 128 bytes is one 8x8 block of halfwords
;       -- confirm.
v_loop: 
        SHR     .S1     A_F6,       13,         A_F6t           ;[22,1] 
||      MPY     .M2     B_S0,       B_c3c5,     B_c5S0          ;[22,1] 
||      MPY     .M1X    A_Q0,       B_c3c5,     A_c5Q0          ;[22,1] 
||      ADD     .D2     B_f1,       B_f6,       B_g1            ;[12,2] 
||      SUB     .S2     B_f2,       B_f5,       B_g3            ;[12,2] 
||      SUB     .L2     B_f1,       B_f6,       B_h3            ;[12,2] 
||      LDH     .D1T2   *-A_i_ptr  [16],        B_f1            ;[ 2,3] 
||[ A_c]ADD     .L1     A_c,        -1,         A_c             ;pro. collapse 
;- 
v_loop_1: 
  [!A_c]STH     .D2T2   B_F4,       *+B_o_ptr  [ 8]             ;[23,1] 
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[23,1] 
||      MPYLH   .M1X    A_Q0,       B_c3c5,     A_c3Q0          ;[23,1] 
||      ADD     .L2     B_h3,       B_g3,       B_s0a           ;[13,2] 
||      SUB     .S2     B_h3,       B_g3,       B_q0a           ;[13,2] 
||      SUB     .S1     A_f0,       A_f7,       A_s1            ;[13,2] s1=h2 
||      ADD     .L1     A_f0,       A_f7,       A_g0            ;[13,2] 
||      LDH     .D1T1   * A_i_ptr++[ 1],        A_f3            ;[ 3,3] 
;- 
v_loop_2: 
  [!A_c]STH     .D2T2   B_F0,       *-B_o_ptr  [24]             ;[24,1] 
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[24,1] 
||      MPYLH   .M2     B_S1,       B_c1c7,     B_c1S1          ;[24,1] 
||      ADD     .L2     B_f2,       B_f5,       B_h1            ;[14,2] 
||      SUB     .S1     A_g0,       A_h0,       A_r0            ;[14,2] 
||      ADD     .L1     A_g0,       A_h0,       A_p0            ;[14,2] 
||      MPYSU   .M1X    B_q0a,      A_k1c0,     A_q0b           ;[14,2] 
||      LDH     .D1T1   *-A_i_ptr  [25],        A_f0            ;[ 4,3] 
;- 
v_loop_3: 
  [!A_c]SHR     .S1     A_F2,       13,         A_F2t           ;[25,1] 
||[!A_c]MPY     .M1     A_i,        4,          A_i             ;[25,1] 
||      SHR     .S2     B_F3,       13,         B_F3t           ;[25,1] 
||      SUB     .L2X    B_c7S1,     A_c1Q1,     B_F7            ;[25,1] 
||      ADD     .L1X    A_c3Q0,     B_c5S0,     A_F5            ;[25,1] 
||      SUB     .D2     B_g1,       B_h1,       B_r1            ;[15,2] 
||      MPYSU   .M2     B_s0a,      B_k1c0,     B_s0b           ;[15,2] 
||      LDH     .D1T2   *+A_i_ptr  [15],        B_f5            ;[ 5,3] 
;- 
v_loop_4: 
        ADD     .L1X    A_c7Q1,     B_c1S1,     A_F1            ;[26,1] 
||[ B_o]B       .S2     v_loop                                  ;[26,1] 
||[!A_c]STH     .D2T1   A_F6t,      *+B_o_ptr  [24]             ;[26,1] 
||      ADD     .L2     B_g1,       B_h1,       B_p1            ;[16,2] 
||      ADDK    .S1     07FFFh,     A_q0b                       ;[16,2] 
||      MPY     .M1     A_r0,       A_c2c6,     A_c6r0          ;[16,2] 
||      MPY     .M2     B_i,        4,          B_i             ;[ 6,3] 
||      LDH     .D1T2   *-A_i_ptr  [ 9],        B_f2            ;[ 6,3] 
;- 
v_loop_5: 
  [!A_c]STH     .D2T1   A_F2t,      *-B_o_ptr  [ 8]             ;[27,1] 
||      SHR     .S1     A_F5,       13,         A_F5t           ;[27,1] 
||      MPY     .M2     B_r1,       B_c2c6,     B_c6r1          ;[17,2] 
||      SUB     .L2X    A_p0,       B_p1,       B_F4            ;[17,2] 
||      ADDK    .S2     07FFFh,     B_s0b                       ;[17,2] 
||      MPYH    .M1     A_q0b,      A_k1c0,     A_q0            ;[17,2] 
||      LDH     .D1T2   *+A_i_ptr  [23],        B_f6            ;[ 7,3] 
;- 
v_loop_6: 
  [!A_c]STH     .D2T2   B_F3t,      * B_o_ptr++[ 1]             ;[28,1] 
||      SHR     .S1     A_F1,       13,         A_F1t           ;[28,1] 
||      ADD     .L2X    A_p0,       B_p1,       B_F0            ;[18,2] 
||      MPYLH   .M1     A_r0,       A_c2c6,     A_c2r0          ;[18,2] 
||      MPYH    .M2     B_s0b,      B_k1c0,     B_s0            ;[18,2] 
||[ B_o]SUB     .S2     B_o,        1,          B_o             ;[18,2] 
||      LDH     .D1T1   *+A_i_ptr  [31],        A_f7            ;[ 8,3] 
v_loop_7: 
;- 
        SHR     .S2     B_F7,       13,         B_F7t           ;[29,1] 
||[!A_c]STH     .D2T1   A_F5t,      *+B_o_ptr  [15]             ;[29,1] 
||      MPYLH   .M2     B_r1,       B_c2c6,     B_c2r1          ;[19,2] 
||      SUB     .L1     A_q1,       A_q0,       A_Q0            ;[19,2] 
||      ADD     .D1     A_q1,       A_q0,       A_Q1            ;[19,2] 
||      SUB     .S1     A_f3,       A_f4,       A_q1            ;[ 9,3] q1=g2 
;- 
v_loop_8: 
  [!A_c]STH     .D2T1   A_F1t,      *-B_o_ptr  [17]             ;[30,1] 
||      ADD     .L1X    B_c6r1,     A_c2r0,     A_F2            ;[20,2] 
||      SUB     .L2X    A_s1,       B_s0,       B_S0            ;[20,2] 
||      MPYLH   .M1     A_Q1,       A_c1c7,     A_c1Q1          ;[20,2] 
||      ADD     .D1     A_f3,       A_f4,       A_h0            ;[10,3] 
||[!B_i]ADD     .S2     B_i,        4,          B_i             ;[10,3] 
||[!B_i]ADDK    .S1     112,        A_i_ptr                     ;[10,3] 
;- 
v_loop_9: 
  [!A_c]STH     .D2T2   B_F7t,      *+B_o_ptr  [31]             ;[31,1] 
||[!A_i]ADDK    .S2     112,        B_o_ptr                     ;[31,1] 
||[!A_i]ADD     .S1     A_i,        4,          A_i             ;[31,1] 
||      SUB     .L1X    A_c6r0,     B_c2r1,     A_F6            ;[21,2] 
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0          ;[21,2] 
||      ADD     .L2X    A_s1,       B_s0,       B_S1            ;[21,2] 
||      MPY     .M1     A_Q1,       A_c1c7,     A_c7Q1          ;[21,2] 
||      LDH     .D1T1   *+A_i_ptr  [ 8],        A_f4            ;[ 1,4] 
; =========================== PIPE LOOP EPILOG ============================ ; 
 
* ========================================================================= * 
*   Epilog / Inter-loop / Prolog Code                                       * 
*                                                                           * 
*   The code from the vertical loop's epilog has been interscheduled        * 
*   with inter-loop code and prolog code for the horizontal loop.           * 
*   This allows hiding some of the overhead as we pipe-down one loop and    * 
*   pipe-up the next.                                                       * 
*                                                                           * 
*   Notably, we restore B15 and IRP here (rather than after the loop)       * 
*   and unspill our loop trip count from the stack, all in parallel with    * 
*   the prolog and epilog code.  Also, the epilog of the first loop has     * 
*   been heavily overlapped with the prolog of the second loop.  Since      * 
*   a handful of symbolic names have been assigned to different registers,  * 
*   and others have conflicting names between the two loops, we use a set   * 
*   of intermediate symbolic names that bridge the transition.              * 
*                                                                           * 
*   To save a STH/LDH pair, the value of "F7t" from the first loop is       * 
*   forwarded directly to the input "f7" of the second loop.  (The last     * 
*   FDCT performed by the vertical loop overlaps the first FDCT performed   * 
*   by the second loop.)  This is done through a "sign extension", to       * 
*   exactly mimic the overflow behavior of the original C code.             * 
*                                                                           * 
*   For speed, we twin the stack pointer in a spare slot here so that the   * 
*   stack restore after the loop can proceed as quickly as possible.        * 
* ========================================================================= * 
 
        ; Bridging names for the tail of the vertical loop.  The "Av_"/"Bv_"
        ; symbols keep vertical-loop values addressable after the horizontal
        ; loop's .asg block rebinds the plain "A_"/"B_" names.  Most of them
        ; map to the same register as their vertical-loop counterpart; the
        ; exceptions are Av_c3Q0 (A6, was A7), Bv_c1S1 (B6, was B5) and
        ; Av_F5t (A9, was A10).
        .asg            A4,         Ah_io_ptr   ; Horiz Input/output pointer 
        .asg            A14,        Av_c1Q1     ; Vert: Intermediate c1 * Q1 
        .asg            A6,         Av_c3Q0     ; Vert: Intermediate c3 * Q0 
        .asg            A8,         Av_c7Q1     ; Vert: Intermediate c7 * Q1 
        .asg            B6,         Bv_c1S1     ; Vert: Intermediate c1 * S1 
        .asg            B3,         Bv_c5S0     ; Vert: Intermediate c5 * S0 
        .asg            B5,         Bv_c7S1     ; Vert: Intermediate c7 * S1 
        .asg            A8,         Av_F1       ; Vert: Freq. domain term F1 
        .asg            A5,         Av_F2       ; Vert: Freq. domain term F2 
        .asg            B4,         Bv_F3       ; Vert: Freq. domain term F3 
        .asg            A9,         Av_F5       ; Vert: Freq. domain term F5 
        .asg            B4,         Bv_F7       ; Vert: Freq. domain term F7 
        .asg            A8,         Av_F1t      ; Vert: Trunc. result for F1 
        .asg            A5,         Av_F2t      ; Vert: Trunc. result for F2 
        .asg            B7,         Bv_F3t      ; Vert: Trunc. result for F3 
        .asg            A9,         Av_F5t      ; Vert: Trunc. result for F5 
        .asg            B5,         Bv_F7t      ; Vert: Trunc. result for F7 
;- 
; Epilog packets: the stores here are no longer predicated on A_c, and no
; further input loads are issued.
        SHR     .S1     A_F6,       13,         A_F6t           ;[22,4] 
||      MPYLH   .M2     B_S1,       B_c1c7,     Bv_c1S1         ;[24,4] 
||      MPY     .M1X    A_Q0,       B_c3c5,     A_c5Q0          ;[22,4] 
||      STH     .D2T2   B_F4,       *+B_o_ptr  [ 8]             ;[23,4] 

; B15 gets the stack pointer back from IRP, where the vertical loop's
; prolog parked it.
        MPY     .M2     B_S0,       B_c3c5,     Bv_c5S0         ;[22,4] 
||      MPYLH   .M1X    A_Q0,       B_c3c5,     Av_c3Q0         ;[23,4] 
||      STH     .D2T1   A_F6t,      *+B_o_ptr  [24]             ;[26,4] 
||      MVC     .S2     IRP,        B15 
;- 
; Ah_io_ptr (A4, the vertical loop's A_i_ptr) is moved back by 54 bytes to
; set up the horizontal loop's addressing.
        STH     .D2T2   B_F0,       *-B_o_ptr  [24]             ;[24,4] 
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[23,4] 
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[24,4] 
||      ADD     .L1X    A_c7Q1,     Bv_c1S1,    A_F1            ;[26,4] 
||      ADDK    .S1     -54,        Ah_io_ptr   ; Adjust pointer 

; A2 receives the caller's IRP value from frame slot 4; it is written back
; to IRP during the horizontal loop's prolog.
        SHR     .S2     B_F3,       13,         B_F3t           ;[25,4] 
||      LDW     .D2T1   *B15[4],    A2          ; Load IRP's value 
||      SHR     .S1     A_F1,       13,         Av_F1t          ;[28,4] 
;- 
; ========================================================================= ; 
 
; =============== SYMBOLIC REGISTER ASSIGNMENTS: HORIZ LOOP =============== ; 
;   These .asg lines rebind many vertical-loop names to different
;   registers; code after this point sees the new bindings.  Names that are
;   not listed here (for example A_k1c0, B_k1c0, A_c2c6, B_c3c5) keep the
;   registers assigned for the vertical loop.
;
;   Register hand-overs worth noting:
;     * A_k_rnd takes A12 (A_c1c7 in the vertical loop) and B_k_rnd takes
;       B1 (B_o in the vertical loop).
;     * B_f6 and B_g3 take B10, which is still B_o_ptr while the vertical
;       loop's last stores drain; the horizontal prolog therefore loads
;       its first f6 into B_f6t (B9) instead.
        .asg            A14,        A_c3c5  ; Cosine terms c3, c5   (packed) 
        .asg            B1,         B_k_rnd ; Rounding constant 0x7FFF 
        .asg            A12,        A_k_rnd ; Rounding constant 0x7FFF 
        .asg            B2,         B_io_ptr; Input/output pointer 
        .asg            A4,         A_io_ptr; Input/output pointer 
        .asg            A7,         A_f0    ; Spatial domain sample f0 
        .asg            B13,        B_f1    ; Spatial domain sample f1 
        .asg            B3,         B_f2    ; Spatial domain sample f2 
        .asg            A10,        A_f3    ; Spatial domain sample f3 
        .asg            A0,         A_f4    ; Spatial domain sample f4 
        .asg            B7,         B_f5    ; Spatial domain sample f5 
        .asg            B9,         B_f6t   ; Spatial domain sample f6 (tmp) 
        .asg            B10,        B_f6    ; Spatial domain sample f6 
        .asg            A8,         A_f7    ; Spatial domain sample f7 
        .asg            A0,         A_g0    ; Node g0 in flow graph 
        .asg            B3,         B_g1    ; Node g1 in flow graph 
        .asg            B8,         B_h1    ; Node h1 in flow graph 
        .asg            A9,         A_h0    ; Node h0 in flow graph 
        .asg            A1,         A_s1    ; Node s1 in flow graph 
        .asg            B13,        B_h3    ; Node h3 in flow graph 
        .asg            B10,        B_g3    ; Node g3 in flow graph 
        .asg            A10,        A_q1    ; Node q1 in flow graph 
        .asg            A5,         A_p0    ; Node p0 in flow graph 
        .asg            B4,         B_p1    ; Node p1 in flow graph 
        .asg            B4,         B_s0a   ; Node s0 intermediate value 
        .asg            B5,         B_s0b   ; Node s0 intermediate value 
        .asg            B5,         B_s0c   ; Node s0 intermediate value 
        .asg            B9,         B_s0    ; Node s0 in flow graph 
        .asg            A0,         A_r0    ; Node r0 in flow graph 
        .asg            B7,         B_r1    ; Node r1 in flow graph 
        .asg            B5,         B_q0a   ; Node q0 intermediate value 
        .asg            A3,         A_q0b   ; Node q0 intermediate value 
        .asg            A0,         A_q0c   ; Node q0 intermediate value 
        .asg            A6,         A_q0    ; Node q0 in flow graph 
        .asg            A9,         A_Q1    ; Node Q1 in flow graph 
        .asg            B8,         B_S1    ; Node S1 in flow graph 
        .asg            A6,         A_Q0    ; Node Q0 in flow graph 
        .asg            B5,         B_S0    ; Node S0 in flow graph 
        .asg            A0,         A_c1Q1  ; Intermediate value c1 * Q1 
        .asg            A5,         A_c2r0  ; Intermediate value c2 * r0 
        .asg            A3,         A_c3Q0  ; Intermediate value c3 * Q0 
        .asg            A9,         A_c5Q0  ; Intermediate value c5 * Q0 
        .asg            A3,         A_c6r0  ; Intermediate value c6 * r0 
        .asg            A3,         A_c7Q1  ; Intermediate value c7 * Q1 
        .asg            B7,         B_c1S1  ; Intermediate value c1 * S1 
        .asg            B3,         B_c2r1  ; Intermediate value c2 * r1 
        .asg            B4,         B_c3S0  ; Intermediate value c3 * S0 
        .asg            B10,        B_c5S0  ; Intermediate value c5 * S0 
        .asg            B4,         B_c6r1  ; Intermediate value c6 * r1 
        .asg            B4,         B_c7S1  ; Intermediate value c7 * S1 
        .asg            B5,         B_F0    ; Frequency domain term F0 
        .asg            A6,         A_F1    ; Frequency domain term F1 
        .asg            B9,         B_F2    ; Frequency domain term F2 
        .asg            B3,         B_F3    ; Frequency domain term F3 
        .asg            A1,         A_F4    ; Frequency domain term F4 
        .asg            A9,         A_F5    ; Frequency domain term F5 
        .asg            A3,         A_F6    ; Frequency domain term F6 
        .asg            B7,         B_F7    ; Frequency domain term F7 
        .asg            B5,         B_F0r   ; Rounded value for F0 
        .asg            A6,         A_F1r   ; Rounded value for F1 
        .asg            B3,         B_F2r   ; Rounded value for F2 
        .asg            B3,         B_F3r   ; Rounded value for F3 
        .asg            A7,         A_F4r   ; Rounded value for F4 
        .asg            A9,         A_F5r   ; Rounded value for F5 
        .asg            A3,         A_F6r   ; Rounded value for F6 
        .asg            B5,         B_F7r   ; Rounded value for F7 
        .asg            B6,         B_F0t   ; Truncated result for F0 
        .asg            A8,         A_F1t   ; Truncated result for F1 
        .asg            B6,         B_F2t   ; Truncated result for F2 
        .asg            B4,         B_F3t   ; Truncated result for F3 
        .asg            A7,         A_F4t   ; Truncated result for F4 
        .asg            A0,         A_F5t   ; Truncated result for F5 
        .asg            A5,         A_F6t   ; Truncated result for F6 
        .asg            B13,        B_F7t   ; Truncated result for F7 
        .asg            A2,         A_o     ; Outer loop counter 
        .asg            B0,         B_c     ; Prolog collapse counter 
        .asg            A1,         A_c     ; Prolog collapse counter copy 
; ========================================================================= ; 
 
* ========================================================================= * 
*   (Instructions marked "(v)" in the prolog below are from the vertical    * 
*   loop's epilog.)                                                         * 
* ========================================================================= * 
 
; =========================== PIPE LOOP PROLOG ============================ ; 
;   Starts the horizontal loop's first loads while the "(v)" instructions
;   finish the vertical loop's last column.  Because B_o_ptr was not
;   post-incremented in the epilog, the (v) stores for F5 and F1 use
;   offsets +16 and -16 rather than the kernel's +15 and -17.
;
;   The SUB below names no functional unit; the assembler is left to pick
;   one.  It sets B_io_ptr 12 bytes below A_io_ptr.
        LDH     .D1T2   *-A_io_ptr  [ 4],       B_f2            ;[ 1,1] 
||      SUB             A_io_ptr,   12,         B_io_ptr 
||      STH     .D2T1   Av_F1t,     *-B_o_ptr  [16]             ;[30,4] (v) 
||      SHR     .S1     Av_F2,      13,         Av_F2t          ;[25,4] (v) 

        LDH     .D1T1   *-A_io_ptr  [ 3],       A_f3            ;[ 2,1] 
||      LDH     .D2T2   *+B_io_ptr  [ 5],       B_f5            ;[ 2,1] 
||      SUB     .L2X    Bv_c7S1,    Av_c1Q1,    Bv_F7           ;[25,4] (v) 
||      ADD     .L1X    Av_c3Q0,    Bv_c5S0,    Av_F5           ;[25,4] (v) 
;- 
; f6 is loaded into B_f6t for this first iteration; B_f6 shares B10 with
; B_o_ptr, which the (v) stores below still use.
        LDH     .D2T1   *+B_io_ptr  [ 4],       A_f4            ;[ 3,1] 
||      LDH     .D1T2   * A_io_ptr--[ 7],       B_f6t           ;[ 3,1] 
||      MVK     .S1     0x7FFF,     A_k_rnd         ; Rounding value 
||      MVK     .S2     0x7FFF,     B_k_rnd         ; Rounding value 

        LDH     .D2T1   * B_io_ptr--[ 8],       A_f0            ;[ 5,1] 
||      LDH     .D1T2   *+A_io_ptr  [ 2],       B_f1            ;[ 4,1] 
||      SHR     .S1     Av_F5,      13,         Av_F5t          ;[27,4] (v) 

;- 
; F7 of the last vertical column is forwarded straight into f7 instead of
; being stored and reloaded.  The shift left by 3 here and the arithmetic
; shift right by 16 at the end of this prolog give a net shift right by 13
; with the result sign-extended from 16 bits.
        SHL     .S1X    Bv_F7,      3,          A_f7            ;[29,4] (v) 
||      STH     .D2T2   Bv_F3t,     * B_o_ptr  [ 0]             ;[28,4] (v) 

; B_c = B_k1c0 with bits 0..15 cleared, leaving only the packed constant 1
; in the upper half.
        CLR     .S2     B_k1c0,     0,15,       B_c ; Prolog collapse: 0x10000 
||      MV      .L1X    B_c3c5,     A_c3c5          ; Twin constant register 
||      STH     .D2T1   Av_F5t,     *+B_o_ptr  [16]             ;[29,4] (v) 

; A2 holds the caller's IRP value loaded from the frame earlier; it is
; written back to IRP here, before A2 is reused as A_o.
        MV      .L1X    B15,        A15             ; Twin stack pointer 
||      MVC     .S2X    A2,         IRP             ; Restore IRP 
||      STH     .D2T1   Av_F2t,     *-B_o_ptr  [ 8]             ;[27,4] (v) 
;- 

; A_o is reloaded from frame slot 3, where the vertical loop's prolog
; spilled the loop count.
        ADD     .L1     A_f3,       A_f4,       A_h0            ;[ 8,1] 
||      ADD     .S2     B_f2,       B_f5,       B_h1            ;[ 8,1] 
||      SUB     .L2     B_f2,       B_f5,       B_g3            ;[ 9,1] 
||      LDW     .D2T1   *B15[3],    A_o 

        SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,1] q1=g2 
||      ADD     .L2     B_f1,       B_f6t,      B_g1            ;[ 9,1] 
||      SUB     .S2     B_f1,       B_f6t,      B_h3            ;[ 9,1] 
||      SHR     .S1     A_f7,       16,         A_f7 
;- 
; =========================== PIPE LOOP KERNEL ============================ ;
;   Software-pipelined kernel for the horizontal pass: ten execute
;   packets (h_loop .. h_loop_9), each using all eight functional units.
;
;   The trailing [a,b] tag on every instruction comes from the original
;   schedule (presumably [cycle, iteration]).  Instructions tagged ,1 ,2
;   and ,3 share packets, i.e. up to three tagged iterations overlap:
;   ,3 performs the loads, ,2 the butterflies, ,1 the final products,
;   rounding and stores.
;
;   Loop control: A_o is the trip counter.  The branch to h_loop is
;   issued at h_loop_4 and A_o is decremented at h_loop_9; both are
;   predicated on A_o being non-zero.  C6000 branches have five delay
;   slots, so the branch issued at h_loop_4 takes effect after h_loop_9.
;
;   Store predication: every STH in the kernel is guarded by [!B_c] or
;   [!A_c].  B_c is seeded ahead of the loop (the CLR commented "Prolog
;   collapse: 0x10000") and is rewritten, together with A_c, by the two
;   multiplies at h_loop_6.  NOTE(review): this presumably suppresses the
;   stores of not-yet-valid iterations during the first pass only --
;   confirm against the prolog.
;
;   Addressing: A_io_ptr and B_io_ptr are each post-decremented by eight
;   halfwords once per pass (h_loop_4 and h_loop_5); every other load and
;   store uses a fixed halfword offset from one of the two pointers.
h_loop:
;   ,1: store F0; start c5*Q0 and c3*S0; F1 = c7Q1 + c1S1; F7 = c7S1 - c1Q1.
;   ,2: g0 = f0 + f7; s0a = h3 + g3; q0a = h3 - g3.
  [!B_c]STH     .D1T2   B_F0t,      *+A_io_ptr[ 9]              ;[20,1]
||      MPY     .M1     A_Q0,       A_c3c5,     A_c5Q0          ;[20,1]
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0          ;[20,1]
||      ADD     .S1X    A_c7Q1,     B_c1S1,     A_F1            ;[20,1]
||      SUB     .S2X    B_c7S1,     A_c1Q1,     B_F7            ;[20,1]
||      ADD     .L1     A_f0,       A_f7,       A_g0            ;[10,2]
||      ADD     .D2     B_h3,       B_g3,       B_s0a           ;[10,2]
||      SUB     .L2     B_h3,       B_g3,       B_q0a           ;[10,2]
;-
h_loop_1:
;   ,1: F7r = F7 + B_k_rnd; store F6.
;   ,2: r1 = g1 - h1; p0 = g0 + h0; r0 = g0 - h0; s0b and q0b are s0a and
;       q0a multiplied by the low half of k1c0 (c0, taken as unsigned).
;   ,3: load f5.
        ADD     .L2     B_F7,       B_k_rnd,    B_F7r           ;[21,1]
||[!B_c]STH     .D2T1   A_F6t,      *+B_io_ptr[22]              ;[21,1]
||      SUB     .S2     B_g1,       B_h1,       B_r1            ;[11,2]
||      MPYSU   .M2     B_s0a,      B_k1c0,     B_s0b           ;[11,2]
||      MPYSU   .M1X    B_q0a,      A_k1c0,     A_q0b           ;[11,2]
||      ADD     .S1     A_g0,       A_h0,       A_p0            ;[11,2]
||      SUB     .L1     A_g0,       A_h0,       A_r0            ;[11,2]
||      LDH     .D1T2   *-A_io_ptr  [ 2],       B_f5            ;[ 1,3]
;-
h_loop_2:
;   ,1: F3 = c3S0 - c5Q0; F5 = c3Q0 + c5S0; F1r = F1 + A_k_rnd;
;       F7t = upper half of F7r times the upper half of k1c0 (documented
;       as 1 in the register table), i.e. the upper halfword of F7r.
;   ,2: p1 = g1 + h1; c6r0 = r0 * c6.
;   ,3: load f3 and f2.
        SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3            ;[22,1]
||      ADD     .S1X    A_c3Q0,     B_c5S0,     A_F5            ;[22,1]
||      ADD     .L1     A_F1,       A_k_rnd,    A_F1r           ;[22,1]
||      MPYH    .M2     B_F7r,      B_k1c0,     B_F7t           ;[22,1]
||      ADD     .L2     B_g1,       B_h1,       B_p1            ;[12,2]
||      MPY     .M1     A_r0,       A_c2c6,     A_c6r0          ;[12,2]
||      LDH     .D1T1   *-A_io_ptr  [ 4],       A_f3            ;[ 2,3]
||      LDH     .D2T2   *+B_io_ptr  [ 2],       B_f2            ;[ 2,3]
;-
h_loop_3:
;   ,1: F3r = F3 + B_k_rnd.
;   ,2: F4 = p0 - p1; q0c = q0b + A_k_rnd; s0c = s0b + B_k_rnd;
;       c2r1 = r1 * c2; c2r0 = r0 * c2.
;   ,3: load f4 and f6.
        ADD     .S2     B_F3,       B_k_rnd,    B_F3r           ;[23,1]
||      SUB     .S1X    A_p0,       B_p1,       A_F4            ;[13,2]
||      ADD     .L1     A_q0b,      A_k_rnd,    A_q0c           ;[13,2]
||      ADD     .L2     B_s0b,      B_k_rnd,    B_s0c           ;[13,2]
||      MPYLH   .M2X    B_r1,       A_c2c6,     B_c2r1          ;[13,2]
||      MPYLH   .M1     A_r0,       A_c2c6,     A_c2r0          ;[13,2]
||      LDH     .D2T1   *+B_io_ptr  [ 4],       A_f4            ;[ 3,3]
||      LDH     .D1T2   *-A_io_ptr  [ 1],       B_f6            ;[ 3,3]
;-
h_loop_4:
;   ,1: F5r = F5 + A_k_rnd; F3t = F3r >> 16; issue the loop branch
;       (taken while A_o is non-zero).
;   ,2: F0 = p0 + p1; s0 and q0 = upper halves of s0c and q0c (MPYH by
;       the upper half of k1c0).
;   ,3: load f7, then step A_io_ptr back by eight halfwords; load f1.
        ADD     .L1     A_F5,       A_k_rnd,    A_F5r           ;[24,1]
||      SHR     .S2     B_F3r,      16,         B_F3t           ;[24,1]
||[ A_o]B       .S1     h_loop                                  ;[24,1]
||      ADD     .L2X    A_p0,       B_p1,       B_F0            ;[14,2]
||      MPYH    .M2     B_s0c,      B_k1c0,     B_s0            ;[14,2]
||      MPYH    .M1     A_q0c,      A_k1c0,     A_q0            ;[14,2]
||      LDH     .D1T1   * A_io_ptr--[ 8],       A_f7            ;[ 4,3]
||      LDH     .D2T2   *+B_io_ptr  [ 1],       B_f1            ;[ 4,3]
;-
h_loop_5:
;   ,1: F2r = F2 + B_k_rnd; F1t = upper half of F1r (MPYH by k1c0).
;   ,2: F4r = F4 + 4; F0r = F0 + 6 (both are shifted right by 3 at
;       h_loop_7); F6 = c6r0 - c2r1; s1 = f0 - f7; c6r1 = r1 * c6.
;   ,3: load f0, then step B_io_ptr back by eight halfwords.  The SUB in
;       this packet reads A_f0 before the new load can land.
        ADD     .S2     B_F2,       B_k_rnd,    B_F2r           ;[25,1]
||      MPYH    .M1     A_F1r,      A_k1c0,     A_F1t           ;[25,1]
||      ADD     .D1     A_F4,       4,          A_F4r           ;[15,2]
||      SUB     .S1X    A_c6r0,     B_c2r1,     A_F6            ;[15,2]
||      SUB     .L1     A_f0,       A_f7,       A_s1            ;[15,2] s1=h2
||      ADD     .L2     B_F0,       6,          B_F0r           ;[15,2]
||      MPY     .M2X    B_r1,       A_c2c6,     B_c6r1          ;[15,2]
||      LDH     .D2T1   * B_io_ptr--[ 8],       A_f0            ;[ 5,3]
;-
h_loop_6:
;   ,1: F5t = F5r >> 16; F2t = F2r >> 16; store F3.
;   ,2: Q0 = q1 - q0; S1 = s1 + s0; Q1 = q1 + q0.
;   Predicate update: B_c = low16(B_c) * 2 and A_c = high16(B_c) *
;   low16(A_c2c6).  With B_c seeded as 0x10000 this makes B_c zero after
;   the first pass and A_c zero one pass later.
        SHR     .S1     A_F5r,      16,         A_F5t           ;[26,1]
||      SHR     .S2     B_F2r,      16,         B_F2t           ;[26,1]
||[!B_c]STH     .D2T2   B_F3t,      *+B_io_ptr[27]              ;[26,1]
||      SUB     .L1     A_q1,       A_q0,       A_Q0            ;[16,2]
||      ADD     .L2X    A_s1,       B_s0,       B_S1            ;[16,2]
||      ADD     .D1     A_q1,       A_q0,       A_Q1            ;[16,2]
||      MPYUS   .M2     B_c,        2,          B_c             ;pro. collapse
||      MPYHL   .M1X    B_c,        A_c2c6,     A_c             ;pro. collapse
;-
h_loop_7:
;   ,1: store F5 and F2.
;   ,2: F6r = F6 + A_k_rnd; F0t = F0r >> 3; F4t = F4r >> 3; S0 = s1 - s0;
;       c7S1 = S1 * c7; c7Q1 = Q1 * c7.
  [!B_c]STH     .D1T1   A_F5t,      *+A_io_ptr[22]              ;[27,1]
||[!B_c]STH     .D2T2   B_F2t,      *+B_io_ptr[26]              ;[27,1]
||      ADD     .L1     A_F6,       A_k_rnd,    A_F6r           ;[17,2]
||      SHR     .S2     B_F0r,      3,          B_F0t           ;[17,2]
||      SHR     .S1     A_F4r,      3,          A_F4t           ;[17,2]
||      SUB     .L2X    A_s1,       B_s0,       B_S0            ;[17,2]
||      MPY     .M2     B_S1,       B_c1c7,     B_c7S1          ;[17,2]
||      MPY     .M1X    A_Q1,       B_c1c7,     A_c7Q1          ;[17,2]
;-
h_loop_8:
;   ,1: store F1 and F7 (these two stores are guarded by A_c, not B_c).
;   ,2: F6t = F6r >> 16; F2 = c6r1 + c2r0; c1S1 = S1 * c1; c1Q1 = Q1 * c1.
;   ,3: h0 = f3 + f4; h1 = f2 + f5.
  [!A_c]STH     .D1T1   A_F1t,      *+A_io_ptr[18]              ;[28,1]
||[!A_c]STH     .D2T2   B_F7t,      *+B_io_ptr[31]              ;[28,1]
||      SHR     .S1     A_F6r,      16,         A_F6t           ;[18,2]
||      ADD     .L2X    B_c6r1,     A_c2r0,     B_F2            ;[18,2]
||      MPYLH   .M2     B_S1,       B_c1c7,     B_c1S1          ;[18,2]
||      MPYLH   .M1X    A_Q1,       B_c1c7,     A_c1Q1          ;[18,2]
||      ADD     .L1     A_f3,       A_f4,       A_h0            ;[ 8,3]
||      ADD     .S2     B_f2,       B_f5,       B_h1            ;[ 8,3]
;-
h_loop_9:
;   Last packet of the pass (final delay slot of the branch at h_loop_4).
;   ,2: decrement the trip counter A_o; store F4; c3Q0 = Q0 * c3;
;       c5S0 = S0 * c5.
;   ,3: q1 = f3 - f4; g1 = f1 + f6; h3 = f1 - f6; g3 = f2 - f5.
  [ A_o]SUB     .S1     A_o,        1,          A_o             ;[19,2]
||[!B_c]STH     .D1T1   A_F4t,      *+A_io_ptr[13]              ;[19,2]
||      MPYLH   .M1     A_Q0,       A_c3c5,     A_c3Q0          ;[19,2]
||      MPY     .M2     B_S0,       B_c3c5,     B_c5S0          ;[19,2]
||      SUB     .L1     A_f3,       A_f4,       A_q1            ;[ 9,3] q1=g2
||      ADD     .L2     B_f1,       B_f6,       B_g1            ;[ 9,3]
||      SUB     .D2     B_f1,       B_f6,       B_h3            ;[ 9,3]
||      SUB     .S2     B_f2,       B_f5,       B_g3            ;[ 9,3]
; =========================== PIPE LOOP EPILOG ============================ ; 
; EPILOG: 
;- 
* ========================================================================= * 
*   Epilog / Final Cleanup Code.                                            * 
*                                                                           * 
*   This code performs the final stores from the epilog while restoring     *
*   Save-On-Entry values from the stack.  The two processes are heavily     * 
*   interwoven in the interest of speed.  For instance, the return addr.    * 
*   is loaded immediately and branched to as soon as it lands in the        * 
*   register file.  Meanwhile, the final epilog stores complete as the      * 
*   return-branch is taken.                                                 * 
*                                                                           * 
*   Note that a handful of symbolic names have been reassigned in the       * 
*   epilog to avoid interfering with the values being loaded from the       * 
*   stack.                                                                  * 
* ========================================================================= * 
;   Re-map five symbolic names onto B5, B9, B8 and A9 for the epilog.
;   B_F3 and B_F3r deliberately share B8.
        .asg            B5,         B_F7t
        .asg            B9,         B_F2r
        .asg            B8,         B_F3
        .asg            B8,         B_F3r
        .asg            A9,         A_F5t

;   Every restore below is an LDW relative to the stack pointer B15 or
;   its copy A15 (made before the loop, "Twin stack pointer"), so that
;   both D units can load in the same cycle.  C6000 loads have four
;   delay slots: a register being restored still holds its old value
;   for the packets that immediately follow its LDW.

;   Finish the last iteration: c5*Q0, c3*S0, F1, F7, F2r.  Start loading
;   the saved CSR value (into A0) and the return address (into B3).
        MPY     .M1     A_Q0,       A_c3c5,     A_c5Q0
||      MPYLH   .M2     B_S0,       B_c3c5,     B_c3S0
||      ADD     .S1X    A_c7Q1,     B_c1S1,     A_F1
||      SUB     .S2X    B_c7S1,     A_c1Q1,     B_F7
||      ADD     .L2     B_F2,       B_k_rnd,    B_F2r
||      LDW     .D2T1   *+ B15[ 2], A0              ; Load CSR's value
||      LDW     .D1T2   *+ A15[ 5], B3              ; Load return address
;-
;   Add k_rnd to F7 and F1.  B11 is B_k1c0, which the next packet still
;   reads; the reload started here has not landed by then.
        ADD     .L2     B_F7,       B_k_rnd,    B_F7r
||      ADD     .L1     A_F1,       A_k_rnd,    A_F1r
||      LDW     .D2T2   *+ B15[ 8], B11             ; Restore B11
||      LDW     .D1T1   *+ A15[13], A13             ; Restore A13

;   F7t and F1t = upper halves of F7r and F1r (MPYH by k1c0).
        MPYH    .M2     B_F7r,      B_k1c0,     B_F7t
||      MPYH    .M1     A_F1r,      A_k1c0,     A_F1t
||      LDW     .D1T2   *+ A15[ 6], B10             ; Restore B10
||      LDW     .D2T1   *+ B15[ 7], A10             ; Restore A10
;-
;   F5 = c3Q0 + c5S0; F3 = c3S0 - c5Q0.
        ADD     .S1X    A_c3Q0,     B_c5S0,     A_F5
||      SUB     .S2X    B_c3S0,     A_c5Q0,     B_F3
||      LDW     .D1T2   *+ A15[14], B14             ; Restore B14
||      LDW     .D2T1   *+ B15[15], A14             ; Restore A14

;   Add k_rnd to F3 and F5.
        ADD     .L2     B_F3,       B_k_rnd,    B_F3r
||      ADD     .L1     A_F5,       A_k_rnd,    A_F5r
||      LDW     .D1T2   *+ A15[10], B12             ; Restore B12
||      LDW     .D2T1   *+ B15[11], A12             ; Restore A12
;-
;   B3 was loaded five packets earlier, so it is valid here.  The five
;   execute packets that follow are the delay slots of this return.
        RET     .S2     B3                          ; Return to caller
||      LDW     .D2T1   *+ B15[12], A11             ; Restore A11

;   Delay slot 1: F3t = F3r >> 16.  B15 is pre-incremented by 16 words,
;   which releases the frame, and A15 is reloaded from the new address.
;   The B13 load in this packet still addresses through the old A15.
        SHR     .S2     B_F3r,      16,         B_F3t
||      LDW     .D2T1   *++B15[16], A15             ; Rst. A15, release stack
||      LDW     .D1T2   *+ A15[ 9], B13             ; Restore B13

;   Delay slot 2: store F1 and F7; F5t = F5r >> 16.
        STH     .D1T1   A_F1t,      *+A_io_ptr[10]
||      STH     .D2T2   B_F7t,      *+B_io_ptr[23]
||      SHR     .S1     A_F5r,      16,         A_F5t
;-
;   Delay slot 3: store F6 and F0.
        STH     .D2T1   A_F6t,      *+B_io_ptr[22]
||      STH     .D1T2   B_F0t,      *+A_io_ptr[ 9]

;   Delay slot 4: F2t = F2r >> 16; store F3.
        SHR     .S2     B_F2r,      16,         B_F2t
||      STH     .D2T2   B_F3t,      *+B_io_ptr[19]

;   Delay slot 5: store F5 and F2; write the saved value back to CSR.
        STH     .D1T1   A_F5t,      *+A_io_ptr[14]
||      STH     .D2T2   B_F2t,      *+B_io_ptr[18]
||      MVC     .S2X    A0,         CSR             ; Restore CSR
;- 
; ===== Interruptibility state restored here ===== 
; ===== Branch Occurs ===== 
 
* ========================================================================= * 
*   End of file:  img_fdct_8x8.asm                                          * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 2003 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= *