; www.pudn.com > ccs_encoder.rar > idct.asm


 .text 
        .global _IMG_idct_8x8 
_IMG_idct_8x8: 
; ============================ SYMBOLIC CONSTANTS ============================
; cst_c1..cst_c7 are fixed-point values with q_pt (11) fractional bits.
; Numerically they match sqrt(2)*cos(k*pi/16) * 2^11 (for example 0x0B19/2048
; is about 1.387, and cst_c4 = 0x0800 is exactly 1.0 in this format).
        .asg            0x0B19,     cst_c1  ; Cosine term c1
        .asg            0x0A74,     cst_c2  ; Cosine term c2
        .asg            0x0968,     cst_c3  ; Cosine term c3
        .asg            0x0800,     cst_c4  ; Cosine term c4
        .asg            0x0649,     cst_c5  ; Cosine term c5
        .asg            0x0454,     cst_c6  ; Cosine term c6
        .asg            0x0235,     cst_c7  ; Cosine term c7
        .asg            11,         q_pt    ; Q-point for calculations
; kq_a / kq_b are the two shift counts handed to the EXT instructions in the
; horizontal pass:  shifting left by 16 and then right (signed) by 16-q_pt
; leaves the sign-extended low halfword scaled by 2^q_pt, which equals a
; multiply by c4 without spending a multiplier.
        .asg            16,         kq_a    ; Extract const for c4 "mpy" (left shift)
        .asg            16-q_pt,    kq_b    ; Extract const for c4 "mpy" (right shift)
        .asg            9,          trunc1  ; Truncation after horizontal pass
        .asg            9,          results ; Final precision of results
        .asg            32-results, trunc2  ; Final truncation right-shift
        .asg            16-results, satl    ; Final saturation left-shift
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR HORIZ LOOP ===============
; Note: several symbolic names below map to the same physical register
; (e.g. A_h1 and A_h0 are both A15).  The hand-built schedule relies on those
; values never being live at the same time, so do not reorder instructions
; without re-checking register lifetimes.
        .asg            B13,        B_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A13,        A_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            B12,        B_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            A12,        A_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            B14,        B_c6c2  ; Cosine terms c6, c2   (packed)
        .asg            A14,        A_i_ptr ; Input pointer #1
        .asg            B15,        B_i_ptr ; Input pointer #2 (B15 is the stack
                                            ;    pointer; it is parked in IRP first)
        .asg            A11,        A_o_ptr ; Output pointer #1
        .asg            B11,        B_o_ptr ; Output pointer #2
        .asg            B2,         B_o     ; Outer loop counter
        .asg            A5,         A_X1X0  ; Incoming coefs X1, X0 (packed)
        .asg            A10,        A_X3X2  ; Incoming coefs X3, X2 (packed)
        .asg            B7,         B_X5X4  ; Incoming coefs X5, X4 (packed)
        .asg            B10,        B_X7X6  ; Incoming coefs X7, X6 (packed)
        .asg            A7,         A_X2c6  ; X2 * c6
        .asg            B0,         B_X6c2  ; X6 * c2
        .asg            A0,         A_X2c2  ; X2 * c2
        .asg            B1,         B_X6c6  ; X6 * c6
        .asg            A6,         A_P0    ; Node P0 in signal flow graph
        .asg            B8,         B_P1    ; Node P1 in signal flow graph
        .asg            A8,         A_p0    ; Node p0 in signal flow graph
        .asg            A0,         A_p1    ; Node p1 in signal flow graph
        .asg            B0,         B_r1    ; Node r1 in signal flow graph
        .asg            B4,         B_r0    ; Node r0 in signal flow graph
        .asg            B7,         B_g0    ; Node g0 in signal flow graph
        .asg            B3,         B_g1    ; Node g1 in signal flow graph
        .asg            A15,        A_h1    ; Node h1 in signal flow graph
        .asg            A15,        A_h0    ; Node h0 in signal flow graph
        .asg            A3,         A_X1c1  ; X1 * c1
        .asg            A0,         A_X1c3  ; X1 * c3
        .asg            A3,         A_X1c5  ; X1 * c5
        .asg            A9,         A_X1c7  ; X1 * c7
        .asg            A9,         A_X3c1  ; X3 * c1
        .asg            A0,         A_X3c3  ; X3 * c3
        .asg            A5,         A_X3c5  ; X3 * c5
        .asg            A5,         A_X3c7  ; X3 * c7
        .asg            B0,         B_X5c1  ; X5 * c1
        .asg            B4,         B_X5c3  ; X5 * c3
        .asg            B3,         B_X5c5  ; X5 * c5
        .asg            B6,         B_X5c7  ; X5 * c7
        .asg            B0,         B_X7c1  ; X7 * c1
        .asg            B3,         B_X7c3  ; X7 * c3
        .asg            B9,         B_X7c5  ; X7 * c5
        .asg            B1,         B_X7c7  ; X7 * c7
        .asg            A7,         A_g2a   ; X1 * c7 - X3 * c5
        .asg            B8,         B_g2b   ; X5 * c3 - X7 * c1
        .asg            A6,         A_g2    ; Node g2 in signal flow graph
        .asg            A3,         A_g3a   ; X1 * c5 - X3 * c1
        .asg            B6,         B_g3b   ; X5 * c7 + X7 * c3
        .asg            A4,         A_g3    ; Node g3 in signal flow graph
        .asg            A6,         A_h3a   ; X1 * c3 - X3 * c7
        .asg            B7,         B_h3b   ; X5 * c1 + X7 * c5
        .asg            B5,         B_h3n   ; Node h3, negated.
        .asg            A0,         A_h2a   ; X1 * c1 + X3 * c3
        .asg            B3,         B_h2b   ; X5 * c5 + X7 * c7
        .asg            B1,         B_h2    ; Node h2 in signal flow graph
        .asg            B4,         B_x0    ; Output x0, pre-truncation
        .asg            B0,         B_x1    ; Output x1, pre-truncation
        .asg            A4,         A_x2    ; Output x2, pre-truncation
        .asg            A4,         A_x3    ; Output x3, pre-truncation
        .asg            A7,         A_x4    ; Output x4, pre-truncation
        .asg            A15,        A_x5    ; Output x5, pre-truncation
        .asg            B6,         B_x6    ; Output x6, pre-truncation
        .asg            B3,         B_x7    ; Output x7, pre-truncation
        .asg            B4,         B_x0t   ; Output x0, truncated to 16 bits
        .asg            B5,         B_x1t   ; Output x1, truncated to 16 bits
        .asg            A4,         A_x2t   ; Output x2, truncated to 16 bits
        .asg            A8,         A_x3t   ; Output x3, truncated to 16 bits
        .asg            A7,         A_x4t   ; Output x4, truncated to 16 bits
        .asg            A5,         A_x5t   ; Output x5, truncated to 16 bits
        .asg            B3,         B_x6t   ; Output x6, truncated to 16 bits
        .asg            B9,         B_x7t   ; Output x7, truncated to 16 bits
        .asg            A2,         A_i     ; Inner-loop counter.
; ============================================================================ 
 
* ========================================================================= * 
*   Initialization code for horizontal loop:  Saves registers to            * 
*   the stack, sets up cosine terms, pointers and loop control.             * 
*                                                                           * 
*   The stack frame for this code is 16 words large.  It holds the Save     * 
*   on Entry (SOE) registers A10..A15, B10..B14, as well as the return      * 
*   address (B3), CSR, IRP, and a single spill value.  (The loop counter    * 
*   initializer is shared between both loops and so I spill it to the       * 
*   stack.)  I twin the stack pointer to speed up stack accesses.  The      * 
*   stack frame layout is slightly funky to avoid bank conflicts while      * 
*   allowing me to get to everything when I need it most.                   * 
*                                                                           * 
*   The horizontal loop starts at the end of the IDCT array and works back  * 
*   towards the beginning.  As a result, the input and output pointers are  * 
*   initialized like so:                                                    * 
*                                                                           * 
*    -- A_i_ptr is set to point to the coefficients "X0" and "X1" in the    * 
*       last row of the last valid IDCT block in the input.  B_i_ptr is     * 
*       set to point to the coefficients "X4" and "X5" in that same row.    * 
*                                                                           * 
*    -- A_o_ptr is set to point to the coefficient "x4" in the rightmost    * 
*       column of the scratch block I require at the end of the array.      * 
*       B_o_ptr is set to point to "x3" in that same column.                * 
*                                                                           * 
*   The loop count is simply the number of IDCTs times 8, minus 1 to        * 
*   handle the parallel iterations in the kernel.  (It would've been more,  * 
*   except that I've performed some limited prolog and epilog collapsing,   * 
*   so I need to iterate the kernel more times.)  A happy coincidence       * 
*   gives both horizontal and vertical loops the exact same trip count,     * 
*   so I spill this value to the stack and simply restore it unchanged      * 
*   for the second loop, rather than recalculating it.                      * 
*                                                                           * 
*   Since I was able to free up a single predication register in the first  * 
*   loop, I prolog-collapsed one stage of the prolog.  I use A1 as my       * 
*   prolog-collapse counter.  To save a MVK (since this code bottlenecks    * 
*   heavily on S units), I initialize it to -1 with an OR, rather than a    * 
*   more traditional 1.                                                     * 
*                                                                           * 
*   Both loops use all 32 registers, so I have saved the stack pointer in   * 
*   IRP.  This is safe since interrupts are explicitly disabled for the     * 
*   entire function.                                                        * 
*                                                                           * 
*   Note:  This setup code could possibly be a cycle or two faster.  For    * 
*   instance, I could copy B15 to A15 before the decrement and use          * 
*   negative indexes for the STWs through A15, saving a whole cycle on      * 
*   the stack saves.  The resulting code doesn't pack as nicely, though.    * 
* ========================================================================= * 
 
;-
; Entry state as used by the code below:  A4 = base address of the coefficient
; array, B4 = number of IDCT blocks, B3 = return address, B15 = stack pointer.
;
; Stack frame, in word offsets from the new (decremented) B15:
;    [ 1] loop-count initializer        [ 2] B3 (return address)
;    [ 3] CSR as read on entry          [ 4] B10    [ 5] A10
;    [ 6] B11    [ 7] A11               [ 8] B12    [ 9] A12
;    [10] A13    [11] B13               [12] B14    [13] A14
;    [14] IRP (stored in h_prolog)      [16] A15 (written before the decrement)
        STW     .D2T1   A15,        *B15--[16]      ; Save A15, get stack frame
||      MVC     .S2     CSR,        B0              ; Grab the current CSR

        AND     .L2     B0,         -2,         B1  ; Clear GIE bit in CSR
||      MV      .L1X    B15,        A15             ; Twin the stack pointer

        STW     .D1T1   A14,        *+A15 [13]      ; Save SOE reg A14
||      STW     .D2T2   B14,        *+B15 [12]      ; Save SOE reg B14
||      MV      .L1X    B0,         A0              ; Partitioning MV. (A0 = unmodified CSR)
||      MVC     .S2     B1,         CSR             ; Interrupts disabled here

;-
        STW     .D1T1   A13,        *+A15 [10]      ; Save SOE reg A13
||      STW     .D2T2   B13,        *+B15 [11]      ; Save SOE reg B13

        STW     .D1T1   A12,        *+A15 [ 9]      ; Save SOE reg A12
||      STW     .D2T2   B12,        *+B15 [ 8]      ; Save SOE reg B12

        STW     .D1T1   A11,        *+A15 [ 7]      ; Save SOE reg A11
||      STW     .D2T2   B11,        *+B15 [ 6]      ; Save SOE reg B11
||      SHL     .S2     B4,         3,      B_o     ; Set up outer loop counter
||      OR      .L1     A1,         -1,     A1      ; Prolog collapse counter

;-
; The SUB below is predicated on B_o so that a zero block count stays zero;
; h_prolog tests B_o == 0 to branch to the abort path.
        STW     .D1T1   A10,        *+A15 [ 5]      ; Save SOE reg A10
||      STW     .D2T2   B10,        *+B15 [ 4]      ; Save SOE reg B10
||      SHL     .S2     B4,         7,      B4      ; Set up end-of-array ptr
||[B_o] SUB     .L2     B_o,        1,      B_o     ; Loop count = IDCTs*8 - 1

        STW     .D2T2   B3,         *+B15 [ 2]      ; Remember the return addr
||      STW     .D1T1   A0,         *+A15 [ 3]      ; Remember the CSR state
||      ADD     .L2X    A4,         B4,     B4      ; Point to scratch area
||      MVC     .S2     IRP,        B0              ; Grab IRP; saved to the frame in h_prolog

;-
; B_i_ptr is B15.  The STW and MVC in this packet read the old stack pointer
; in the same cycle that the SUB overwrites it; from here on the stack is
; reachable only through the twin in A15 or the copy held in IRP.
        STW     .D2T2   B_o,        *+B15 [ 1]      ; Spill our loop count init
||      MVC     .S2     B15,        IRP             ; Save stack ptr in IRP
||      SUB     .L2     B4,         8,      B_i_ptr ; Point to X5X4, row 7
||      MV      .L1X    B4,         A_o_ptr         ; Scratch-area base for output ptr #1
||      MVK     .S1     7,          A_i             ; Set up inner loop counter

        SUB     .L1X    B_i_ptr,    8,      A_i_ptr ; Point to X1X0, row 7
||      ADDAH   .D2     B4,         31,     B_o_ptr ; Point to x3, col 7
||      ADDK    .S1     78,         A_o_ptr         ; Point to x4, col 7
;- 
; ============================ PIPE LOOP PROLOG ==============================
; Primes the horizontal-pass pipeline:  issues the first iteration's loads,
; builds the packed cosine constants, and starts the first multiplies.
; The abort branch is issued in the first packet; the five execute packets
; that follow it sit in its delay slots (see the "Branch Occurs" marker).
; The loads are predicated on B_o so that nothing is fetched when the block
; count is zero.
;
; Packed constants are built in two steps:  MVK writes the low halfword,
; then MVKLH fills in the upper halfword (e.g. A_c3c1 = c3 : c1).
h_prolog:
  [ B_o]LDW     .D1T1   * A_i_ptr--[4],         A_X1X0  ;[ 1,1]
||[ B_o]LDW     .D2T2   *+B_i_ptr[1],           B_X7X6  ;[ 1,1]
||      MVK     .S1     cst_c1,     A_c3c1              ; c1
||[!B_o]B       .S2     idct_8x8_abort                  ; num_idcts==0? Abort.

  [ B_o]LDW     .D1T1   *+A_i_ptr[5],           A_X3X2  ;[ 2,1]
||[ B_o]LDW     .D2T2   * B_i_ptr--[4],         B_X5X4  ;[ 2,1]
||      MVK     .S1     cst_c5,     A_c7c5              ; c5
||      MVK     .S2     cst_c2,     B_c6c2              ; c2
;-
        STW     .D1T2   B0,         *A15[14]            ; save IRP (A15 still twins the SP here)

        MVKLH   .S1     cst_c7,     A_c7c5              ; c7
||      MVKLH   .S2     cst_c6,     B_c6c2              ; c6

        MVKLH   .S1     cst_c3,     A_c3c1              ; c3
||      MVK     .S2     cst_c5,     B_c7c5              ; c5

        MPYH    .M1     A_X1X0,     A_c7c5,     A_X1c7  ;[ 6,1]
||      MPYLH   .M2     B_X7X6,     B_c6c2,     B_X6c6  ;[ 6,1]
||      MVKLH   .S2     cst_c7,     B_c7c5              ; c7

; ===== Branch Occurs =====
;-
        EXT     .S1     A_X1X0,     kq_a, kq_b, A_P0    ;[ 7,1]
||      MPY     .M1X    A_X3X2,     B_c6c2,     A_X2c2  ;[ 7,1]
||      MPYHL   .M2     B_X7X6,     B_c7c5,     B_X7c5  ;[ 7,1]
||      MV      .L2X    A_c3c1,     B_c3c1              ; Twin c3c1 onto the B side

        ADDK    .S1     256,        A_P0                ;[ 8,1]
||      EXT     .S2     B_X5X4,     kq_a, kq_b, B_P1    ;[ 8,1]
||      MPYHL   .M1     A_X1X0,     A_c3c1,     A_X1c1  ;[ 8,1]
||      MPYH    .M2     B_X7X6,     B_c7c5,     B_X7c7  ;[ 8,1]
;-
;- 
; ============================ PIPE LOOP KERNEL ==============================
; Horizontal-pass kernel:  ten execute packets (h_loop_0 .. h_loop_9), one
; per cycle.  Each instruction carries a [c,i] tag; c appears to be the cycle
; of that instruction within a single iteration's schedule, and i identifies
; which of the overlapped iterations it belongs to (tags for one iteration
; are spaced 10 cycles apart, matching the kernel length).
;
; Loop bookkeeping visible in this kernel:
;   * B_o  - trip counter; decremented and tested in h_loop_4.
;   * A1   - prolog-collapse counter.  It starts at -1, so the operations
;            predicated on [!A1] are suppressed until the [A1] ADD in
;            h_loop_6 has counted it up to zero.
;   * A_i  - incremented in h_loop_0 and masked to its low three bits in
;            h_loop_4; when it is zero, both output pointers are pulled back
;            by 28 words (h_loop_7 and h_loop_9).
h_loop:
h_loop_0:
        SUB     .L2     B_g1,       B_h3n,      B_x1    ;[19,1]
||      STH     .D2T2   B_x0t,      *-B_o_ptr[24]       ;[19,1]
||      ADD     .D1     A_i,        1,          A_i     ;[19,1]
||      SHR     .S1     A_x3,       trunc1,     A_x3t   ;[19,1]
||      ADD     .L1X    A_g3a,      B_g3b,      A_g3    ;[19,1]
||      ADD     .S2X    A_X2c2,     B_X6c6,     B_r0    ;[ 9,2]
||      MPYH    .M1     A_X3X2,     A_c3c1,     A_X3c3  ;[ 9,2]
||      MPYHL   .M2     B_X5X4,     B_c7c5,     B_X5c5  ;[ 9,2]

h_loop_1:
        ADD     .L2     B_g1,       B_h3n,      B_x6    ;[20,1]
||[!A1] STH     .D2T1   A_x3t,      * B_o_ptr--[1]      ;[20,1]
||      ADD     .S1     A_h1,       A_g3,       A_x2    ;[20,1]
||      SUB     .D1     A_h1,       A_g3,       A_x5    ;[20,1]
||      ADD     .L1X    A_P0,       B_P1,       A_p0    ;[10,2]
||      MPYHL   .M1     A_X1X0,     A_c7c5,     A_X1c5  ;[10,2]
||      MPYHL   .M2     B_X7X6,     B_c3c1,     B_X7c1  ;[10,2]

h_loop_2:
        SHR     .S1     A_x5,       trunc1,     A_x5t   ;[21,1]
||      SHR     .S2     B_x1,       trunc1,     B_x1t   ;[21,1]
||      ADD     .L1     A_X1c1,     A_X3c3,     A_h2a   ;[11,2]
||      ADD     .L2     B_X5c5,     B_X7c7,     B_h2b   ;[11,2]
||      MPYH    .M1     A_X1X0,     A_c3c1,     A_X1c3  ;[11,2]
||      MPYH    .M2     B_X5X4,     B_c7c5,     B_X5c7  ;[11,2]
||      LDW     .D1T1   * A_i_ptr--[4],         A_X1X0  ;[ 1,3]
||      LDW     .D2T2   *+B_i_ptr[1],           B_X7X6  ;[ 1,3]

h_loop_3:
        SHR     .S2     B_x6,       trunc1,     B_x6t   ;[22,1]
||      SHR     .S1     A_x2,       trunc1,     A_x2t   ;[22,1]
||      SUB     .L1X    A_p0,       B_r0,       A_h0    ;[12,2]
||      ADD     .L2X    A_h2a,      B_h2b,      B_h2    ;[12,2]
||      MPYH    .M1     A_X3X2,     A_c7c5,     A_X3c7  ;[12,2]
||      MPYH    .M2     B_X5X4,     B_c3c1,     B_X5c3  ;[12,2]
||      LDW     .D1T1   *+A_i_ptr[5],           A_X3X2  ;[ 2,3]
||      LDW     .D2T2   * B_i_ptr--[4],         B_X5X4  ;[ 2,3]

; The loop branch is issued here; h_loop_5 .. h_loop_9 occupy its delay
; slots, so the branch takes effect after h_loop_9.
h_loop_4:
  [ B_o]B       .S2     h_loop                          ;[23,1]
||      STH     .D1T1   A_x5t,      *+A_o_ptr[8]        ;[23,1]
||      SHR     .S1     A_x4,       trunc1,     A_x4t   ;[23,1]
||      ADD     .L2X    A_p0,       B_r0,       B_g0    ;[13,2]
||[ B_o]SUB     .D2     B_o,        1,          B_o     ;[13,2]
||[!A1] AND     .L1     A_i,        7,          A_i     ;[13,2]
||      MPYHL   .M1     A_X3X2,     A_c7c5,     A_X3c5  ;[13,2]
||      MPYHL   .M2     B_X5X4,     B_c3c1,     B_X5c1  ;[13,2]

h_loop_5:
  [!A1] STH     .D1T1   A_x4t,      * A_o_ptr--[1]      ;[24,1]
||      SUB     .S1     A_X1c3,     A_X3c7,     A_h3a   ;[14,2]
||      SUB     .L1X    A_P0,       B_P1,       A_p1    ;[14,2]
||      ADD     .S2     B_g0,       B_h2,       B_x0    ;[14,2]
||      SUB     .L2     B_X5c3,     B_X7c1,     B_g2b   ;[14,2]
||      MPYHL   .M1     A_X3X2,     A_c3c1,     A_X3c1  ;[14,2]
||      MPY     .M2     B_X7X6,     B_c6c2,     B_X6c2  ;[14,2]

h_loop_6:
        STH     .D1T2   B_x6t,      *+A_o_ptr[17]       ;[25,1]
||      SUB     .D2     B_g0,       B_h2,       B_x7    ;[15,2]
||      SHR     .S2     B_x0,       trunc1,     B_x0t   ;[15,2]
||      SUB     .S1     A_X1c7,     A_X3c5,     A_g2a   ;[15,2]
||      ADD     .L2     B_X5c1,     B_X7c5,     B_h3b   ;[15,2]
||      MPYLH   .M1X    A_X3X2,     B_c6c2,     A_X2c6  ;[15,2]
||      MPYH    .M2     B_X7X6,     B_c3c1,     B_X7c3  ;[15,2]
||[ A1] ADD     .L1     A1,         1,          A1      ; Count prolog-collapse flag up to 0

h_loop_7:
  [!A_i]SUBAW   .D1     A_o_ptr,    28,         A_o_ptr ;[26,1]
||      STH     .D2T2   B_x1t,      *-B_o_ptr[15]       ;[26,1]
||      SHR     .S2     B_x7,       trunc1,     B_x7t   ;[16,2]
||      SUB     .L1     A_X1c5,     A_X3c1,     A_g3a   ;[16,2]
||      SUB     .L2X    B_h3b,      A_h3a,      B_h3n   ;[16,2]
||      ADD     .S1X    A_g2a,      B_g2b,      A_g2    ;[16,2]
||      MPYH    .M1     A_X1X0,     A_c7c5,     A_X1c7  ;[ 6,3]
||      MPYLH   .M2     B_X7X6,     B_c6c2,     B_X6c6  ;[ 6,3]

h_loop_8:
        STH     .D2T1   A_x2t,      *-B_o_ptr[7]        ;[27,1]
||      ADD     .L1     A_h0,       A_g2,       A_x3    ;[17,2]
||      SUB     .D1     A_h0,       A_g2,       A_x4    ;[17,2]
||      SUB     .L2X    A_X2c6,     B_X6c2,     B_r1    ;[17,2]
||      EXT     .S1     A_X1X0,     kq_a, kq_b, A_P0    ;[ 7,3]
||      EXT     .S2     B_X5X4,     kq_a, kq_b, B_P1    ;[ 7,3]
||      MPY     .M1X    A_X3X2,     B_c6c2,     A_X2c2  ;[ 7,3]
||      MPYHL   .M2     B_X7X6,     B_c7c5,     B_X7c5  ;[ 7,3]

h_loop_9:
  [!A_i]SUBAW   .D2     B_o_ptr,    28,         B_o_ptr ;[28,1]
||      STH     .D1T2   B_x7t,      *+A_o_ptr[24]       ;[18,2]
||      ADD     .S2X    A_p1,       B_r1,       B_g1    ;[18,2]
||      SUB     .L1X    A_p1,       B_r1,       A_h1    ;[18,2]
||      ADD     .L2     B_X5c7,     B_X7c3,     B_g3b   ;[18,2]
||      ADDK    .S1     256,        A_P0                ;[ 8,3]
||      MPYHL   .M1     A_X1X0,     A_c3c1,     A_X1c1  ;[ 8,3]
||      MPYH    .M2     B_X7X6,     B_c7c5,     B_X7c7  ;[ 8,3]

 
; ============================ PIPE LOOP EPILOG ==============================
; Finishes the iteration that is still in flight when the kernel exits (every
; instruction here is tagged as iteration 3).  Unlike the kernel, the stores
; here do not post-decrement the output pointers, so the constant offsets
; differ by one halfword from their kernel counterparts (for example
; *-B_o_ptr[16] here versus *-B_o_ptr[15] in h_loop_7).
h_epilog:
        SUB     .L2     B_g1,       B_h3n,      B_x1    ;[19,3]
||      STH     .D2T2   B_x0t,      *-B_o_ptr[24]       ;[19,3]
||      SHR     .S1     A_x3,       trunc1,     A_x3t   ;[19,3]
||      ADD     .L1X    A_g3a,      B_g3b,      A_g3    ;[19,3]

        ADD     .L2     B_g1,       B_h3n,      B_x6    ;[20,3]
||      STH     .D2T1   A_x3t,      *+B_o_ptr[0]        ;[20,3]
||      ADD     .S1     A_h1,       A_g3,       A_x2    ;[20,3]
||      SUB     .D1     A_h1,       A_g3,       A_x5    ;[20,3]
;-
        SHR     .S1     A_x5,       trunc1,     A_x5t   ;[21,3]
||      SHR     .S2     B_x1,       trunc1,     B_x1t   ;[21,3]

        SHR     .S2     B_x6,       trunc1,     B_x6t   ;[22,3]
||      SHR     .S1     A_x2,       trunc1,     A_x2t   ;[22,3]
||      STH     .D2T2   B_x1t,      *-B_o_ptr[16]       ;[26,3]

        STH     .D1T1   A_x5t,      *+A_o_ptr[8]        ;[23,3]
||      SHR     .S1     A_x4,       trunc1,     A_x4t   ;[23,3]

 
* ========================================================================= * 
*   Interloop code:  Performs remaining epilog from horizontal pass, and    * 
*   begins setup of the vertical pass.                                      * 
*                                                                           * 
*   In order to save some time between loops, I start performing pointer    * 
*   fixups and constant initializations in the epilog of the horizontal     * 
*   pass loop.  The horizontal pass works from the bottom of the            * 
*   IDCT list and ends at the top, whereas the vertical pass works from     * 
*   the top of the list and ends up at the bottom.  As a result, the        * 
*   displacement between the required pointer settings between the two      * 
*   loops is fixed, regardless of the number of IDCTs processed, since      * 
*   the two loops pointers always meet at the top of the list.              * 
*                                                                           * 
*   The vertical loop needs a new repacking of the cosine terms: c6c3 and   * 
*   c2c1.  By playing around w/ how the cosine terms are packed,            * 
*   I was able to save two whole registers in the vertical loop and thus    * 
*   fit into the register file.  I do this repacking partly here, and       * 
*   partly in the vertical loop's prolog.                                   * 
* ========================================================================= * 
 
; Last three stores of the horizontal pass, overlapped with the first pieces
; of vertical-pass setup (input-pointer fixups, start of the c6c3 constant,
; and recovery of the stack pointer from IRP).
        STH     .D1T1   A_x4t,      *+A_o_ptr[0]        ;[24,3]
;-
        STH     .D1T2   B_x6t,      *+A_o_ptr[16]       ;[25,3]
||      ADDK    .S1     168,        A_i_ptr             ; Fixup for vert loop
||      ADDK    .S2     156,        B_i_ptr             ; Fixup for vert loop

        .asg            A15,        A_c6c3              ; Symbolic name from
                                                        ;    vertical loop.

; The SHR moves c3 (upper half of A_c3c1) into the low half of A_c6c3; the
; upper half (c6) is filled in by an MVKLH in v_prolog.  B0 receives the
; stack pointer that was parked in IRP; v_prolog loads through it.
        STH     .D2T1   A_x2t,      *-B_o_ptr[8]        ;[27,3]
||      SHR     .S1     A_c3c1,     16, A_c6c3          ; Set up new cos cst
||      MVC     .S2     IRP,        B0                  ; Get SP so we can
                                                        ;    unspill A_o.
 
; ============================================================================ 
 
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR VERT LOOP ================
; These definitions replace the horizontal-loop meanings of any name that is
; reused.  As in the first loop, several names share a physical register and
; depend on the schedule keeping their live ranges apart.
        .asg            A14,        A_i_ptr ; Input pointer #1
        .asg            B15,        B_i_ptr ; Input pointer #2
        .asg            A11,        A_o_ptr ; Output pointer #1
        .asg            B11,        B_o_ptr ; Output pointer #2
        .asg            B13,        B_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A13,        A_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A15,        A_c6c3  ; Cosine terms c6, c3   (packed)
        .asg            B12,        B_c2c1  ; Cosine terms c2, c1   (packed)
        .asg            A4,         A_c1c4  ; Cosine term  c1, c4 (alternates)
        .asg            A2,         A_o     ; Outer loop counter
        .asg            B2,         B_i     ; Inner loop counter
        .asg            A12,        A_X7X6  ; Incoming coefs X7, X6 (packed)
        .asg            A8,         A_X5X4  ; Incoming coefs X5, X4 (packed)
        .asg            B10,        B_X3X2  ; Incoming coefs X3, X2 (packed)
        .asg            B14,        B_X1X0  ; Incoming coefs X1, X0 (packed)
        .asg            B9,         B_rnd   ; Rounding value applied to P0
        .asg            B1,         B_P0_t  ; Node P0, temporary pre-rounding
        .asg            B5,         B_P0    ; Rounded value of Node P0
        .asg            A7,         A_P1    ; Node P1 in signal flow graph
        .asg            B0,         B_X2c2  ; X2 * c2
        .asg            B4,         B_X2c6  ; X2 * c6
        .asg            A4,         A_X6c2  ; X6 * c2
        .asg            A3,         A_X6c6  ; X6 * c6
        .asg            A5,         A_p0    ; Node p0 in signal flow graph
        .asg            A8,         A_p1    ; Node p1 in signal flow graph
        .asg            B4,         B_r1    ; Node r1 in signal flow graph
        .asg            B3,         B_r0    ; Node r0 in signal flow graph
        .asg            B0,         B_g0    ; Node g0 in signal flow graph
        .asg            A1,         A_g1    ; Node g1 in signal flow graph
        .asg            B3,         B_h1    ; Node h1 in signal flow graph
        .asg            A3,         A_h0    ; Node h0 in signal flow graph
        .asg            B5,         B_X1c1  ; X1 * c1
        .asg            B1,         B_X1c3  ; X1 * c3
        .asg            B3,         B_X1c5  ; X1 * c5
        .asg            B8,         B_X1c7  ; X1 * c7
        .asg            B0,         B_X3c1  ; X3 * c1
        .asg            B0,         B_X3c3  ; X3 * c3
        .asg            B0,         B_X3c5  ; X3 * c5
        .asg            B9,         B_X3c7  ; X3 * c7
        .asg            A3,         A_X5c1  ; X5 * c1
        .asg            A1,         A_X5c3  ; X5 * c3
        .asg            A5,         A_X5c5  ; X5 * c5
        .asg            A0,         A_X5c7  ; X5 * c7
        .asg            A6,         A_X7c1  ; X7 * c1
        .asg            A7,         A_X7c3  ; X7 * c3
        .asg            A4,         A_X7c5  ; X7 * c5
        .asg            A6,         A_X7c7  ; X7 * c7
        .asg            A3,         A_h2a   ; X5 * c5 + X7 * c7
        .asg            B3,         B_h2b   ; X1 * c1 + X3 * c3
        .asg            B6,         B_h2    ; Node h2 in signal flow graph
        .asg            A4,         A_h3a   ; X5 * c1 + X7 * c5
        .asg            B1,         B_h3b   ; X1 * c3 - X3 * c7
        .asg            A3,         A_h3    ; Node h3 in signal flow graph
        .asg            A9,         A_g3a   ; X5 * c7 + X7 * c3
        .asg            B1,         B_g3b   ; X1 * c5 - X3 * c1
        .asg            B7,         B_g3    ; Node g3 in signal flow graph
        .asg            A9,         A_g2a   ; X5 * c3 - X7 * c1
        .asg            B1,         B_g2b   ; X1 * c7 - X3 * c5
        .asg            A0,         A_g2    ; Node g2 in signal flow graph
        .asg            B8,         B_x0    ; Output x0, pre-saturate/truncate
        .asg            A1,         A_x1    ; Output x1, pre-saturate/truncate
        .asg            B7,         B_x2    ; Output x2, pre-saturate/truncate
        .asg            A4,         A_x3    ; Output x3, pre-saturate/truncate
        .asg            A0,         A_x4    ; Output x4, pre-saturate/truncate
        .asg            B4,         B_x5    ; Output x5, pre-saturate/truncate
        .asg            A5,         A_x6    ; Output x6, pre-saturate/truncate
        .asg            B6,         B_x7    ; Output x7, pre-saturate/truncate
        .asg            B5,         B_x0s   ; Output x0, saturated to 9 bits
        .asg            A10,        A_x1s   ; Output x1, saturated to 9 bits
        .asg            B3,         B_x2s   ; Output x2, saturated to 9 bits
        .asg            A6,         A_x3s   ; Output x3, saturated to 9 bits
        .asg            A7,         A_x4s   ; Output x4, saturated to 9 bits
        .asg            B4,         B_x5s   ; Output x5, saturated to 9 bits
        .asg            A3,         A_x6s   ; Output x6, saturated to 9 bits
        .asg            B6,         B_x7s   ; Output x7, saturated to 9 bits
        .asg            B8,         B_x0t   ; Output x0, truncated to 9 bits
        .asg            A0,         A_x1t   ; Output x1, truncated to 9 bits
        .asg            B0,         B_x2t   ; Output x2, truncated to 9 bits
        .asg            A6,         A_x3t   ; Output x3, truncated to 9 bits
        .asg            A7,         A_x4t   ; Output x4, truncated to 9 bits
        .asg            B4,         B_x5t   ; Output x5, truncated to 9 bits
        .asg            A5,         A_x6t   ; Output x6, truncated to 9 bits
        .asg            B3,         B_x7t   ; Output x7, truncated to 9 bits
; ============================================================================ 
 
; ============================ PIPE LOOP PROLOG ==============================
; Primes the vertical-pass pipeline.  It reloads the shared loop count from
; the stack frame (B0 holds the stack pointer recovered from IRP), finishes
; the output-pointer fixups and the repacked cosine constants, and runs the
; first iteration up to cycle 16.
;
; Entry into the kernel is done with a chain of branches issued on
; consecutive cycles.  Each one targets the next kernel execute packet at a
; byte offset past its leading instructions ("+ 8 ; skip 2" skips two 4-byte
; instructions).  For the kernel packets visible in this file, the skipped
; instructions are the ones tagged with the latest pipeline cycles (28-30),
; presumably because no iteration has reached those cycles yet.
v_prolog:
        LDW     .D2T1   *B0[1],     A_o                 ; Unspill loop counter
||      ADDK    .S2     -128,       B_o_ptr             ; Fixup for vert loop
;-
        LDW     .D1T1   *+A_i_ptr[1],           A_X7X6  ;[ 1,1]
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0  ;[ 1,1]

        ADDK    .S1     -128,       A_o_ptr             ; Fixup for vert loop

        ; Set up modified constants for second loop
        ; Note: A_c7c5, B_c7c5 are in same regs both loops.
        ; Also, B_c2c1 reuses h_loop's B_c3c1.

        LDW     .D2T2   * B_i_ptr++[4],         B_X3X2  ;[ 3,1]
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4  ;[ 3,1]

        MVKLH   .S2     cst_c2,     B_c2c1              ; c2 (B_c2c1 == B_c3c1)
||      MVKLH   .S1     cst_c6,     A_c6c3              ; c6

        MVK     .S2     8,          B_i                 ; Inner loop counter.
;-
        MPYHL   .M1     A_X7X6,     A_c6c3,     A_X7c3  ;[ 6,1]

        MPYH    .M1     A_X7X6,     A_c7c5,     A_X7c7  ;[ 7,1]
||      MPYHL   .M2     B_X1X0,     B_c2c1,     B_X1c1  ;[ 7,1]

        MVK     .S1     cst_c4,     A_c1c4              ;[ 8,1]
||      MPYH    .M1     A_X5X4,     A_c7c5,     A_X5c7  ;[ 8,1]
||      MPYHL   .M2     B_X1X0,     B_c7c5,     B_X1c5  ;[ 8,1]

        MPY     .M1     A_X5X4,     A_c1c4,     A_P1    ;[ 9,1]
||      MPYHL   .M2     B_X3X2,     B_c2c1,     B_X3c1  ;[ 9,1]
;-
        ADD     .D1     A_X5c7,     A_X7c3,     A_g3a   ;[10,1]
||      MPYHL   .M1     A_X5X4,     A_c6c3,     A_X5c3  ;[10,1]
||      MPYHL   .M2X    B_X3X2,     A_c6c3,     B_X3c3  ;[10,1]

        SUB     .L2     B_X1c5,     B_X3c1,     B_g3b   ;[11,1]
||      MPYHL   .M1     A_X5X4,     A_c7c5,     A_X5c5  ;[11,1]
||      MPY     .M2X    B_X1X0,     A_c1c4,     B_P0_t  ;[11,1]
||      MVK     .S2     -32768,     B_rnd               ;[ 6,1]
||      B       .S1     v_loop_0 + 8                    ; skip 2
;-
        ADD     .L2X    B_g3b,      A_g3a,      B_g3    ;[12,1]
||      MPYHL   .M1X    A_X7X6,     B_c2c1,     A_X7c1  ;[12,1]
||      MPYH    .M2     B_X3X2,     B_c7c5,     B_X3c7  ;[12,1]
||      LDW     .D1T1   *+A_i_ptr[1],           A_X7X6  ;[ 1,2]
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0  ;[ 1,2]
||      B       .S2     v_loop_1 + 8                    ; skip 2
;-
; B_rnd is -32768, so the SUB below adds 0x8000 to P0.
; The MVKL switches A_c1c4 from c4 to c1 for the X5 * c1 multiply that follows.
        SUB     .D2     B_P0_t,     B_rnd,      B_P0    ;[13,1]
||      ADD     .L2     B_X1c1,     B_X3c3,     B_h2b   ;[13,1]
||      ADD     .L1     A_X5c5,     A_X7c7,     A_h2a   ;[13,1]
||      MPYLH   .M1X    A_X7X6,     B_c2c1,     A_X6c2  ;[13,1]
||      MPYLH   .M2X    B_X3X2,     A_c6c3,     B_X2c6  ;[13,1]
||      B       .S2     v_loop_2 + 12                   ; skip 3
||      MVKL    .S1     cst_c1,     A_c1c4              ; A_c1c4 now holds c1
;-
        SUB     .L1     A_X5c3,     A_X7c1,     A_g2a   ;[14,1]
||      MPYHL   .M1     A_X5X4,     A_c1c4,     A_X5c1  ;[14,1]
||      MPYHL   .M2X    B_X1X0,     A_c6c3,     B_X1c3  ;[14,1]
||      LDW     .D2T2   * B_i_ptr++[4],         B_X3X2  ;[ 3,2]
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4  ;[ 3,2]
||      B       .S2     v_loop_3 + 4                    ; skip 1
||      ADD     .S1X    B_P0,       A_P1,       A_p0    ;[16,1]
;-
        ADD     .L2X    B_h2b,      A_h2a,      B_h2    ;[15,1]
||      SUB     .L1X    B_P0,       A_P1,       A_p1    ;[15,1]
||      MPYHL   .M1     A_X7X6,     A_c7c5,     A_X7c5  ;[15,1]
||      MPYLH   .M2     B_X3X2,     B_c2c1,     B_X2c2  ;[15,1]
||      B       .S2     v_loop_4 + 4                    ; skip 1

        SUB     .L2X    B_X2c6,     A_X6c2,     B_r1    ;[16,1]
||      MPYLH   .M1     A_X7X6,     A_c6c3,     A_X6c6  ;[16,1]
||      MPYH    .M2     B_X1X0,     B_c7c5,     B_X1c7  ;[16,1]
;- 
; ===== Branch Occurs ===== 
; ============================ PIPE LOOP KERNEL ============================== 
; ---------------------------------------------------------------------------
; v_loop: steady-state kernel of the software-pipelined vertical pass.
;
; The kernel is eleven execute packets (v_loop_0 .. v_loop_a).  Every
; instruction carries a tag [c,s]; instructions sharing a packet have c values
; 11 apart, i.e. they belong to different in-flight iterations.
;
; Arithmetic performed per iteration (names follow the register symbols):
;   P0 = X0*c4 - rnd  (rnd = -32768)      P1 = X4*c4
;   p0 = P0 + P1                          p1 = P0 - P1
;   r0 = X2*c2 + X6*c6                    r1 = X2*c6 - X6*c2
;   g0 = p0 + r0    h0 = p0 - r0          g1 = p1 + r1    h1 = p1 - r1
;   g2 = (X1*c7 - X3*c5) + (X5*c3 - X7*c1)
;   g3 = (X1*c5 - X3*c1) + (X5*c7 + X7*c3)
;   h2 = (X1*c1 + X3*c3) + (X5*c5 + X7*c7)
;   h3 = (X1*c3 - X3*c7) - (X5*c1 + X7*c5)
;   x0 = g0 + h2    x1 = g1 + h3    x2 = h1 + g3    x3 = h0 + g2
;   x7 = g0 - h2    x6 = g1 - h3    x5 = h1 - g3    x4 = h0 - g2
; Each x is then shifted left by satl with SSHL, shifted right by trunc2 with
; SHR, and stored as a halfword.
;
; Output addressing: x4 is stored at A_o_ptr and x5/x6/x7 at halfword offsets
; +8/+16/+24 from it; x3 is stored at B_o_ptr and x2/x1/x0 at -8/-16/-24.
; (x1 and x0 use -17/-25 because they are stored after B_o_ptr has been
; post-incremented in v_loop_2.)  Both pointers advance one halfword per
; iteration and, when B_i is zero, by a further 28 words.
; NOTE(review): the 28-word step presumably moves to the next 8x8 block --
; confirm against the pointer setup, which is outside this section.
;
; Counters: A_o is the trip counter (branch + decrement in v_loop_5; packets
; v_loop_6 .. v_loop_a execute in the branch delay slots).  B_i is decremented
; in v_loop_a while A_o is non-zero and masked with 7 in v_loop_8.
;
; CAUTION: the prolog enters this kernel with branches to v_loop_0 + 8,
; v_loop_1 + 8, v_loop_2 + 12, v_loop_3 + 4 and v_loop_4 + 4, skipping the
; leading output-stage instructions of those packets.  The order of the
; instructions at the head of v_loop_0 .. v_loop_4 must therefore not change.
; ---------------------------------------------------------------------------
v_loop: 
; Packet 0: store x7, final shift of x4; form h3a, h3b, h1; X3*c5, X7*c3;
;           reload the rounding constant B_rnd.
v_loop_0: 
        STH     .D1T2   B_x7t,      *+A_o_ptr[24]       ;[28,1] 
||      SHR     .S1     A_x4s,      trunc2,     A_x4t   ;[28,1] 
||      ADD     .L1     A_X5c1,     A_X7c5,     A_h3a   ;[17,2] 
||      SUB     .D2     B_X1c3,     B_X3c7,     B_h3b   ;[17,2] 
||      SUB     .L2X    A_p1,       B_r1,       B_h1    ;[17,2] 
||      MPYHL   .M2     B_X3X2,     B_c7c5,     B_X3c5  ;[17,2] 
||      MVK     .S2     -32768,     B_rnd               ;[ 6,3] 
||      MPYHL   .M1     A_X7X6,     A_c6c3,     A_X7c3  ;[ 6,3] 
 
; Packet 1: store x4 and post-increment A_o_ptr; final shift of x1;
;           form x2, x5, g1, r0; X7*c7, X1*c1.
v_loop_1: 
        STH     .D1T1   A_x4t,      * A_o_ptr++[1]      ;[29,1] 
||      SHR     .S1     A_x1s,      trunc2,     A_x1t   ;[29,1] 
||      ADD     .S2     B_h1,       B_g3,       B_x2    ;[18,2] 
||      SUB     .D2     B_h1,       B_g3,       B_x5    ;[18,2] 
||      ADD     .L1X    A_p1,       B_r1,       A_g1    ;[18,2] 
||      ADD     .L2X    B_X2c2,     A_X6c6,     B_r0    ;[18,2] 
||      MPYH    .M1     A_X7X6,     A_c7c5,     A_X7c7  ;[ 7,3] 
||      MPYHL   .M2     B_X1X0,     B_c2c1,     B_X1c1  ;[ 7,3] 
 
; Packet 2: extra A_o_ptr step when B_i == 0; store x3 and post-increment
;           B_o_ptr; final shift of x0; form g2b, h3; X5*c7, X1*c5.
;           A_c1c4 is (re)loaded with cst_c4 here on every iteration; the
;           prolog's last write to that register is cst_c1.
v_loop_2: 
  [!B_i]ADDAW   .D1     A_o_ptr,    28,         A_o_ptr ;[30,1] 
||      STH     .D2T1   A_x3t,      * B_o_ptr++[1]      ;[30,1] 
||      SHR     .S2     B_x0s,      trunc2,     B_x0t   ;[30,1] 
||      SUB     .L2     B_X1c7,     B_X3c5,     B_g2b   ;[19,2] 
||      SUB     .L1X    B_h3b,      A_h3a,      A_h3    ;[19,2] 
||      MVK     .S1     cst_c4,     A_c1c4              ;[ 8,3] 
||      MPYH    .M1     A_X5X4,     A_c7c5,     A_X5c7  ;[ 8,3] 
||      MPYHL   .M2     B_X1X0,     B_c7c5,     B_X1c5  ;[ 8,3] 
 
; Packet 3: store x1; form g0, h0, x6, x1; saturating shift of x5;
;           P1 = X4*c4, X3*c1.
v_loop_3: 
        STH     .D2T1   A_x1t,      *-B_o_ptr[17]       ;[31,1] 
||      ADD     .L2X    A_p0,       B_r0,       B_g0    ;[20,2] 
||      SSHL    .S2     B_x5,       satl,       B_x5s   ;[20,2] 
||      SUB     .S1X    A_p0,       B_r0,       A_h0    ;[20,2] 
||      SUB     .L1     A_g1,       A_h3,       A_x6    ;[20,2] 
||      ADD     .D1     A_g1,       A_h3,       A_x1    ;[20,2] 
||      MPY     .M1     A_X5X4,     A_c1c4,     A_P1    ;[ 9,3] 
||      MPYHL   .M2     B_X3X2,     B_c2c1,     B_X3c1  ;[ 9,3] 
 
; Packet 4: store x0; form x7, x0, g2, g3a; saturating shift of x1;
;           X5*c3, X3*c3.
v_loop_4: 
        STH     .D2T2   B_x0t,      *-B_o_ptr[25]       ;[32,1] 
||      SUB     .S2     B_g0,       B_h2,       B_x7    ;[21,2] 
||      ADD     .L2     B_g0,       B_h2,       B_x0    ;[21,2] 
||      ADD     .L1X    B_g2b,      A_g2a,      A_g2    ;[21,2] 
||      SSHL    .S1     A_x1,       satl,       A_x1s   ;[21,2] 
||      ADD     .D1     A_X5c7,     A_X7c3,     A_g3a   ;[10,3] 
||      MPYHL   .M1     A_X5X4,     A_c6c3,     A_X5c3  ;[10,3] 
||      MPYHL   .M2X    B_X3X2,     A_c6c3,     B_X3c3  ;[10,3] 
 
; Packet 5: loop-back branch and trip-count decrement (both predicated on
;           A_o); extra B_o_ptr step when B_i == 0; saturating shift of x2;
;           form x3, g3b; X5*c5, P0_t = X0*c4.
v_loop_5: 
  [ A_o]B       .S1     v_loop                          ;[33,1] 
||[!B_i]ADDAW   .D2     B_o_ptr,    28,         B_o_ptr ;[33,1] 
||      SSHL    .S2     B_x2,       satl,       B_x2s   ;[22,2] 
||      ADD     .D1     A_h0,       A_g2,       A_x3    ;[22,2] 
||[ A_o]SUB     .L1     A_o,        1,          A_o     ;[22,2] 
||      SUB     .L2     B_X1c5,     B_X3c1,     B_g3b   ;[11,3] 
||      MPYHL   .M1     A_X5X4,     A_c7c5,     A_X5c5  ;[11,3] 
||      MPY     .M2X    B_X1X0,     A_c1c4,     B_P0_t  ;[11,3] 
 
; Packet 6: final shift of x5; form x4, g3; saturating shift of x6;
;           X7*c1, X3*c7; issue loads of X7X6 and X1X0 for a new iteration.
v_loop_6: 
        SHR     .S2     B_x5s,      trunc2,     B_x5t   ;[23,2] 
||      SUB     .L1     A_h0,       A_g2,       A_x4    ;[23,2] 
||      SSHL    .S1     A_x6,       satl,       A_x6s   ;[23,2] 
||      ADD     .L2X    B_g3b,      A_g3a,      B_g3    ;[12,3] 
||      MPYHL   .M1X    A_X7X6,     B_c2c1,     A_X7c1  ;[12,3] 
||      MPYH    .M2     B_X3X2,     B_c7c5,     B_X3c7  ;[12,3] 
||      LDW     .D1T1   *+A_i_ptr[1],           A_X7X6  ;[ 1,4] 
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0  ;[ 1,4] 
 
; Packet 7: final shifts of x2 and x6; store x5; P0 = P0_t - rnd (adds the
;           32768 rounding term); form h2b, h2a; X6*c2, X2*c6.
v_loop_7: 
        SHR     .S2     B_x2s,      trunc2,     B_x2t   ;[24,2] 
||      STH     .D1T2   B_x5t,      *+A_o_ptr[8]        ;[24,2] 
||      SHR     .S1     A_x6s,      trunc2,     A_x6t   ;[24,2] 
||      SUB     .D2     B_P0_t,     B_rnd,      B_P0    ;[13,3] 
||      ADD     .L2     B_X1c1,     B_X3c3,     B_h2b   ;[13,3] 
||      ADD     .L1     A_X5c5,     A_X7c7,     A_h2a   ;[13,3] 
||      MPYLH   .M1X    A_X7X6,     B_c2c1,     A_X6c2  ;[13,3] 
||      MPYLH   .M2X    B_X3X2,     A_c6c3,     B_X2c6  ;[13,3] 
 
; Packet 8: mask B_i with 7; saturating shifts of x7 and x3; form g2a;
;           X5*c1, X1*c3; issue loads of X3X2 and X5X4 and advance both
;           input pointers by 4 words.
v_loop_8: 
        AND     .L2     B_i,        7,          B_i     ;[36,1] 
||      SSHL    .S2     B_x7,       satl,       B_x7s   ;[25,2] 
||      SSHL    .S1     A_x3,       satl,       A_x3s   ;[25,2] 
||      SUB     .L1     A_X5c3,     A_X7c1,     A_g2a   ;[14,3] 
||      MPYHL   .M1X    A_X5X4,     B_c2c1,     A_X5c1  ;[14,3] 
||      MPYHL   .M2X    B_X1X0,     A_c6c3,     B_X1c3  ;[14,3] 
||      LDW     .D2T2   * B_i_ptr++[4],         B_X3X2  ;[ 3,4] 
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4  ;[ 3,4] 
 
; Packet 9: store x2; final shifts of x3 and x7; form h2, p1; X7*c5, X2*c2.
v_loop_9: 
        STH     .D2T2   B_x2t,      *-B_o_ptr[8]        ;[26,2] 
||      SHR     .S1     A_x3s,      trunc2,     A_x3t   ;[26,2] 
||      SHR     .S2     B_x7s,      trunc2,     B_x7t   ;[26,2] 
||      ADD     .L2X    B_h2b,      A_h2a,      B_h2    ;[15,3] 
||      SUB     .L1X    B_P0,       A_P1,       A_p1    ;[15,3] 
||      MPYHL   .M1     A_X7X6,     A_c7c5,     A_X7c5  ;[15,3] 
||      MPYLH   .M2     B_X3X2,     B_c2c1,     B_X2c2  ;[15,3] 
 
; Packet a: decrement B_i (only while A_o is non-zero); store x6; saturating
;           shifts of x0 and x4; form r1, p0; X6*c6, X1*c7.
v_loop_a: 
  [ A_o]SUB     .D2     B_i,        1,          B_i     ;[27,2] 
||      STH     .D1T1   A_x6t,      *+A_o_ptr[16]       ;[27,2] 
||      SSHL    .S2     B_x0,       satl,       B_x0s   ;[27,2] 
||      SSHL    .S1     A_x4,       satl,       A_x4s   ;[27,2] 
||      SUB     .L2X    B_X2c6,     A_X6c2,     B_r1    ;[16,3] 
||      ADD     .L1X    B_P0,       A_P1,       A_p0    ;[16,3] 
||      MPYLH   .M1     A_X7X6,     A_c6c3,     A_X6c6  ;[16,3] 
||      MPYH    .M2     B_X1X0,     B_c7c5,     B_X1c7  ;[16,3] 
 
; ============================ PIPE LOOP EPILOG ============================== 
v_epilog: 
* ========================================================================= * 
*   Post-vertical loop code:  Performs remaining vertical-loop epilog,      * 
*   pulls registers from the stack, restores the interrupt-enable state,    * 
*   and returns to the caller.                                              * 
*                                                                           * 
*   For speed, I start pulling items from the stack as quickly as           * 
*   possible. I pop the return address earliest, followed by the CSR        * 
*   restore value and the rest of the stack frame (basically, the SOE       * 
*   registers).                                                             * 
*                                                                           * 
*   I throw the return branch in flight nearly as soon as the return addr   * 
*   arrives from the stack in order to return to the caller as soon as      * 
*   possible.  I don't think it's possible to save any more time in this    * 
*   epilog code.  :-)                                                       * 
*                                                                           * 
*   Once the stack-frame restore is complete, I allow the remainder of      * 
*   the epilog (mostly shifts and stores) to complete, in the remaining     * 
*   delay slots of the return branch.  Since the stack-restore loads        * 
*   need to complete before this time anyway, I couldn't have used those    * 
*   cycles for much else anyway.                                            * 
*                                                                           * 
*   The interrupt-enable state is not restored until the return branch      * 
*   is in flight.  This implies that any pending interrupt will be taken    * 
*   on arrival in the calling function, assuming it called the IDCT with    * 
*   interrupts enabled.                                                     * 
*                                                                           * 
*   Again, this code uses twin stack-pointers for speed.                    * 
*                                                                           * 
*   To highlight how intertwined the epilog is with the stack frame code    * 
*   I've added comments highlighting what is what.                          * 
*                                                                           * 
*   I've played a trick in order to allow an early abort from the code:     * 
*   If the loop trip count is calculated to be zero by the main setup code  * 
*   at the beginning, an emergency branch is made to the abort label        * 
*   below.  (The abort is triggered only if we're asked to do zero IDCTs.)  * 
*   The outer loop trip count for the first loop (B_o) is stored in B2.     * 
*   The second loop uses B2 for its inner loop trip count (B_i).  Under     * 
*   normal operation, B2 (aka. B_o) is non-zero upon entry to this code.    * 
*   However, in the case of an abort, it will be zero, since we did not     * 
*   execute either loop.  Therefore we can use B_o to shut off the epilog   * 
*   stores in the case of an early abort.                                   * 
* ========================================================================= * 
; ---------------------------------------------------------------------------
; idct_8x8_abort: common exit path.  Reached by fall-through from the vertical
; loop and, per the note above, by an early branch when there is nothing to
; do.  It drains the last output stage of the pipeline (one SHR each for x4,
; x1, x0 and the stores of x7, x3, x1, x0, x4), reloads the saved registers
; from the stack frame, and returns.
;
; The five epilog stores are predicated on B_o so that they are suppressed
; when B_o is zero.  The shifts are unconditional; they only write registers.
;
; The return branch is issued before the frame restore is finished; the five
; execute packets that follow RET run in its delay slots.
; ---------------------------------------------------------------------------
idct_8x8_abort: 
  [ B_o]STH     .D1T2   B_x7t,      *+A_o_ptr[24]       ; epilog: store x7 
||      SHR     .S1     A_x4s,      trunc2,     A_x4t   ; epilog: x4 >> trunc2 
||      MVC     .S2     IRP,        B15                 ; Get stack pointer 
 
        LDW     .D2T2   *+ B15[ 2], B3                  ; Get return address 
||      MV      .L2     B_o_ptr,    B0                  ; Copy: B11 is reloaded 
||      SHR     .S1     A_x1s,      trunc2,     A_x1t   ; epilog: x1 >> trunc2 
||      MV      .L1X    B15,        A1                  ; Twin stack pointer 
 
        LDW     .D2T1   *+ B15[14], A2                  ; Saved IRP (via A2) 
;- 
        LDW     .D2T2   *+ B15[10], B1                  ; Saved A13 (via B1) 
||      LDW     .D1T1   *+ A1 [13], A14                 ; Restore A14 
 
        LDW     .D2T2   *+ B15[12], B14                 ; Restore B14 
||      LDW     .D1T1   *+ A1 [ 3], A3                  ; Saved CSR (via A3) 
 
        LDW     .D1T2   *+ A1 [ 4], B10                 ; Restore B10 
||      LDW     .D2T1   *+ B15[ 5], A10                 ; Restore A10 
 
        LDW     .D1T2   *+ A1 [ 6], B11                 ; Restore B11 
||      LDW     .D2T1   *+ B15[ 7], A11                 ; Restore A11 
;- 
        LDW     .D1T2   *+ A1 [ 8], B12                 ; Restore B12 
||      LDW     .D2T1   *+ B15[ 9], A12                 ; Restore A12 
||      RET     .S2     B3                              ; Go home! 
 
; ---- The remaining five packets execute in the delay slots of the RET. ----
        LDW     .D1T2   *+ A1 [11], B13                 ; Restore B13 
||      LDW     .D2T1   *++B15[16], A15                 ; Restore A15, B15 
||      MV      .L1X    B1,         A13                 ; Restore A13 
 
  [ B_o]STH     .D2T1   A_x3t,      * B0                ; epilog: store x3 
||      SHR     .S2     B_x0s,      trunc2,     B_x0t   ; epilog: x0 >> trunc2 
;- 
 
  [ B_o]STH     .D2T1   A_x1t,      *-B0[16]            ; epilog: store x1 
 
  [ B_o]STH     .D2T2   B_x0t,      *-B0[24]            ; epilog: store x0 
||      MVC     .S2X    A2,         IRP                 ; Restore IRP 
 
; x4 is addressed from B0 rather than A_o_ptr because A11 has been reloaded
; above.  NOTE(review): relies on A_o_ptr == B_o_ptr + 8 halfwords, which
; matches the kernel's store offsets; the pointer setup is outside this view.
  [ B_o]STH     .D2T1   A_x4t,      *+B0[8]             ; epilog: store x4 
||      MVC     .S2X    A3,         CSR                 ; Restore CSR 
;- 
v_end: 
 
* ========================================================================= * 
*   End of file:  img_idct_8x8.asm                                          * 
* ------------------------------------------------------------------------- *