; www.pudn.com > ccs_encoder.rar > idct.asm
.text
.global _IMG_idct_8x8
; ----------------------------------------------------------------------------
; _IMG_idct_8x8: function entry point (exported by the .global above).
; As used by the setup code further down in this file: A4 is taken as the
; base address of the coefficient data and B4 as the number of 8x8 blocks
; (B4 shifted left by 7 is added to A4 to find the end of the array, and
; B4 shifted left by 3 seeds the loop trip count).
; ----------------------------------------------------------------------------
_IMG_idct_8x8:
; ============================ SYMBOLIC CONSTANTS ============================
; The cosine terms are fixed-point values with q_pt (11) fractional bits;
; note that cst_c4 == 2^q_pt, i.e. exactly 1.0 in that format.
; NOTE(review): the values match sqrt(2)*cos(k*pi/16) scaled by 2^11
; (e.g. 0x0B19 = 2841) -- inferred from the numbers, confirm against the
; algorithm description.
.asg 0x0B19, cst_c1 ; Cosine term c1
.asg 0x0A74, cst_c2 ; Cosine term c2
.asg 0x0968, cst_c3 ; Cosine term c3
.asg 0x0800, cst_c4 ; Cosine term c4
.asg 0x0649, cst_c5 ; Cosine term c5
.asg 0x0454, cst_c6 ; Cosine term c6
.asg 0x0235, cst_c7 ; Cosine term c7
.asg 11, q_pt ; Q-point for calculations
; kq_a / kq_b are the two shift counts handed to EXT in the horizontal pass:
; shift left by kq_a (16) to isolate the low halfword, then arithmetic shift
; right by kq_b (16 - q_pt = 5). The net effect is the sign-extended low
; halfword scaled by 2^q_pt, which equals a multiply by cst_c4.
.asg 16, kq_a ; Extract const for c4 "mpy"
.asg 16-q_pt, kq_b ; Extract const for c4 "mpy"
; trunc1 is the right shift (SHR) applied to every horizontal-pass result.
.asg 9, trunc1 ; Truncation after horizontal pass
; The vertical pass saturates with SSHL by satl (16 - 9 = 7) and then shifts
; right by trunc2 (32 - 9 = 23), leaving a 'results'-bit signed value.
.asg 9, results ; Final precision of results
.asg 32-results, trunc2 ; Final truncation right-shift
.asg 16-results, satl ; Final saturation left-shift
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR HORIZ LOOP ===============
; Naming: the A_ / B_ prefix gives the register file the symbol lives in.
; "XnXm" symbols hold two packed 16-bit coefficients (Xn in the upper half,
; Xm in the lower half); "cNcM" symbols hold two packed cosine constants.
; Many symbols deliberately alias the same physical register (for example
; A_h1 and A_h0 are both A15, and B_o_ptr is B11 in both loops).
; NOTE(review): the aliasing presumably relies on the hand schedule keeping
; the live ranges disjoint -- re-verify before moving any instruction.
; B_i_ptr is B15, i.e. the stack pointer register is reused as a data
; pointer; the real stack pointer is parked in IRP by the setup code.
.asg B13, B_c7c5 ; Cosine terms c7, c5 (packed)
.asg A13, A_c7c5 ; Cosine terms c7, c5 (packed)
.asg B12, B_c3c1 ; Cosine terms c3, c1 (packed)
.asg A12, A_c3c1 ; Cosine terms c3, c1 (packed)
.asg B14, B_c6c2 ; Cosine terms c6, c2 (packed)
.asg A14, A_i_ptr ; Input pointer #1
.asg B15, B_i_ptr ; Input pointer #2
.asg A11, A_o_ptr ; Output pointer #1
.asg B11, B_o_ptr ; Output pointer #2
.asg B2, B_o ; Outer loop counter
.asg A5, A_X1X0 ; Incoming coefs X1, X0 (packed)
.asg A10, A_X3X2 ; Incoming coefs X3, X2 (packed)
.asg B7, B_X5X4 ; Incoming coefs X5, X4 (packed)
.asg B10, B_X7X6 ; Incoming coefs X7, X6 (packed)
.asg A7, A_X2c6 ; X2 * c6
.asg B0, B_X6c2 ; X6 * c2
.asg A0, A_X2c2 ; X2 * c2
.asg B1, B_X6c6 ; X6 * c6
.asg A6, A_P0 ; Node P0 in signal flow graph
.asg B8, B_P1 ; Node P1 in signal flow graph
.asg A8, A_p0 ; Node p0 in signal flow graph
.asg A0, A_p1 ; Node p1 in signal flow graph
.asg B0, B_r1 ; Node r1 in signal flow graph
.asg B4, B_r0 ; Node r0 in signal flow graph
.asg B7, B_g0 ; Node g0 in signal flow graph
.asg B3, B_g1 ; Node g1 in signal flow graph
.asg A15, A_h1 ; Node h1 in signal flow graph
.asg A15, A_h0 ; Node h0 in signal flow graph
.asg A3, A_X1c1 ; X1 * c1
.asg A0, A_X1c3 ; X1 * c3
.asg A3, A_X1c5 ; X1 * c5
.asg A9, A_X1c7 ; X1 * c7
.asg A9, A_X3c1 ; X3 * c1
.asg A0, A_X3c3 ; X3 * c3
.asg A5, A_X3c5 ; X3 * c5
.asg A5, A_X3c7 ; X3 * c7
.asg B0, B_X5c1 ; X5 * c1
.asg B4, B_X5c3 ; X5 * c3
.asg B3, B_X5c5 ; X5 * c5
.asg B6, B_X5c7 ; X5 * c7
.asg B0, B_X7c1 ; X7 * c1
.asg B3, B_X7c3 ; X7 * c3
.asg B9, B_X7c5 ; X7 * c5
.asg B1, B_X7c7 ; X7 * c7
.asg A7, A_g2a ; X1 * c7 - X3 * c5
.asg B8, B_g2b ; X5 * c3 - X7 * c1
.asg A6, A_g2 ; Node g2 in signal flow graph
.asg A3, A_g3a ; X1 * c5 - X3 * c1
.asg B6, B_g3b ; X5 * c7 + X7 * c3
.asg A4, A_g3 ; Node g3 in signal flow graph
.asg A6, A_h3a ; X1 * c3 - X3 * c7
.asg B7, B_h3b ; X5 * c1 + X7 * c5
.asg B5, B_h3n ; Node h3, negated (h3b - h3a).
.asg A0, A_h2a ; X1 * c1 + X3 * c3
.asg B3, B_h2b ; X5 * c5 + X7 * c7
.asg B1, B_h2 ; Node h2 in signal flow graph
.asg B4, B_x0 ; Output x0, pre-truncation
.asg B0, B_x1 ; Output x1, pre-truncation
.asg A4, A_x2 ; Output x2, pre-truncation
.asg A4, A_x3 ; Output x3, pre-truncation
.asg A7, A_x4 ; Output x4, pre-truncation
.asg A15, A_x5 ; Output x5, pre-truncation
.asg B6, B_x6 ; Output x6, pre-truncation
.asg B3, B_x7 ; Output x7, pre-truncation
.asg B4, B_x0t ; Output x0, truncated to 16 bits
.asg B5, B_x1t ; Output x1, truncated to 16 bits
.asg A4, A_x2t ; Output x2, truncated to 16 bits
.asg A8, A_x3t ; Output x3, truncated to 16 bits
.asg A7, A_x4t ; Output x4, truncated to 16 bits
.asg A5, A_x5t ; Output x5, truncated to 16 bits
.asg B3, B_x6t ; Output x6, truncated to 16 bits
.asg B9, B_x7t ; Output x7, truncated to 16 bits
.asg A2, A_i ; Inner-loop counter.
; ============================================================================
* ========================================================================= *
* Initialization code for horizontal loop: Saves registers to *
* the stack, sets up cosine terms, pointers and loop control. *
* *
* The stack frame for this code is 16 words large. It holds the Save *
* on Entry (SOE) registers A10..A15, B10..B14, as well as the return *
* address (B3), CSR, IRP, and a single spill value. (The loop counter *
* initializer is shared between both loops and so I spill it to the *
* stack.) I twin the stack pointer to speed up stack accesses. The *
* stack frame layout is slightly funky to avoid bank conflicts while *
* allowing me to get to everything when I need it most. *
* *
* The horizontal loop starts at the end of the IDCT array and works back *
* towards the beginning. As a result, the input and output pointers are *
* initialized like so: *
* *
* -- A_i_ptr is set to point to the coefficients "X0" and "X1" in the *
* last row of the last valid IDCT block in the input. B_i_ptr is *
* set to point to the coefficients "X4" and "X5" in that same row. *
* *
* -- A_o_ptr is set to point to the coefficient "x4" in the rightmost *
* column of the scratch block I require at the end of the array. *
* B_o_ptr is set to point to "x3" in that same column. *
* *
* The loop count is simply the number of IDCTs times 8, minus 1 to *
* handle the parallel iterations in the kernel. (It would've been more, *
* except that I've performed some limited prolog and epilog collapsing, *
* so I need to iterate the kernel more times.) A happy coincidence *
* gives both horizontal and vertical loops the exact same trip count, *
* so I spill this value to the stack and simply restore it unchanged *
* for the second loop, rather than recalculating it. *
* *
* Since I was able to free up a single predication register in the first *
* loop, I prolog-collapsed one stage of the prolog. I use A1 as my *
* prolog-collapse counter. To save a MVK (since this code bottlenecks *
* heavily on S units), I initialize it to -1 with an OR, rather than a *
* more traditional 1. *
* *
* Both loops use all 32 registers, so I have saved the stack pointer in *
* IRP. This is safe since interrupts are explicitly disabled for the *
* entire function. *
* *
* Note: This setup code could possibly be a cycle or two faster. For *
* instance, I could copy B15 to A15 before the decrement and use *
* negative indexes for the STWs through A15, saving a whole cycle on *
* the stack saves. The resulting code doesn't pack as nicely, though. *
* ========================================================================= *
;-
; Function setup: allocate the 16-word frame, disable interrupts, save the
; registers this routine overwrites, and derive the horizontal-pass pointers
; and trip count from the incoming A4 (data base) and B4 (block count).
;
; Frame slots, as word offsets from the post-decrement B15 (A15 mirrors it):
;   [ 1] loop-count spill   [ 2] B3 (return addr)   [ 3] CSR as found on entry
;   [ 4] B10   [ 5] A10   [ 6] B11   [ 7] A11   [ 8] B12   [ 9] A12
;   [10] A13   [11] B13   [12] B14   [13] A14
;   [14] caller's IRP (stored later, in h_prolog)
;   [16] A15 (written through the pre-decrement address by the first STW)
STW .D2T1 A15, *B15--[16] ; Save A15, get stack frame
|| MVC .S2 CSR, B0 ; Grab the current CSR
AND .L2 B0, -2, B1 ; Clear GIE bit in CSR
|| MV .L1X B15, A15 ; Twin the stack pointer
STW .D1T1 A14, *+A15 [13] ; Save SOE reg A14
|| STW .D2T2 B14, *+B15 [12] ; Save SOE reg B14
|| MV .L1X B0, A0 ; Partitioning MV.
|| MVC .S2 B1, CSR ; Interrupts disabled here
;-
STW .D1T1 A13, *+A15 [10] ; Save SOE reg A13
|| STW .D2T2 B13, *+B15 [11] ; Save SOE reg B13
STW .D1T1 A12, *+A15 [ 9] ; Save SOE reg A12
|| STW .D2T2 B12, *+B15 [ 8] ; Save SOE reg B12
; B_o = B4 * 8 (rows to process). A1 | -1 == -1 regardless of A1's old
; value, so this sets the prolog-collapse counter without using an S unit.
STW .D1T1 A11, *+A15 [ 7] ; Save SOE reg A11
|| STW .D2T2 B11, *+B15 [ 6] ; Save SOE reg B11
|| SHL .S2 B4, 3, B_o ; Set up outer loop counter
|| OR .L1 A1, -1, A1 ; Prolog collapse counter
;-
; B4 * 128 is the byte size of the block array (8x8 halfwords per block).
; The SUB is predicated on B_o so a zero block count stays zero, which is
; what later triggers the abort branch in h_prolog.
STW .D1T1 A10, *+A15 [ 5] ; Save SOE reg A10
|| STW .D2T2 B10, *+B15 [ 4] ; Save SOE reg B10
|| SHL .S2 B4, 7, B4 ; Set up end-of-array ptr
||[B_o] SUB .L2 B_o, 1, B_o ; Loop count = IDCTs*8 - 1
STW .D2T2 B3, *+B15 [ 2] ; Remember the return addr
|| STW .D1T1 A0, *+A15 [ 3] ; Remember the CSR state
|| ADD .L2X A4, B4, B4 ; Point to scratch area
|| MVC .S2 IRP, B0 ; Grab caller's IRP (saved in h_prolog)
;-
; From this packet on B15 is B_i_ptr, not the stack pointer: the SUB below
; overwrites it in the same cycle that the MVC copies the old value to IRP.
STW .D2T2 B_o, *+B15 [ 1] ; Spill our loop count init
|| MVC .S2 B15, IRP ; Save stack ptr in IRP
|| SUB .L2 B4, 8, B_i_ptr ; Point to X5X4, row 7
|| MV .L1X B4, A_o_ptr ; Output ptr #1 = scratch base
|| MVK .S1 7, A_i ; Set up inner loop counter
; Output pointers: +31 halfwords = element [3][7], +78 bytes = element [4][7]
; of the 8x8 halfword block that starts at B4.
SUB .L1X B_i_ptr, 8, A_i_ptr ; Point to X1X0, row 7
|| ADDAH .D2 B4, 31, B_o_ptr ; Point to x3, col 7
|| ADDK .S1 78, A_o_ptr ; Point to x4, col 7
;-
; ============================ PIPE LOOP PROLOG ==============================
; Horizontal-pass prolog: issues the first row's loads and multiplies, builds
; the packed cosine constants, and stores the caller's IRP into the frame.
; The ;[c,i] tags are the original author's schedule markers.
; NOTE(review): presumably "cycle c of in-flight iteration i" -- confirm.
h_prolog:
; The loads are predicated on B_o so nothing is read when the block count is
; zero. In that case the branch to idct_8x8_abort is taken; its five delay
; slots are the next five execute packets, ending at "Branch Occurs" below.
[ B_o]LDW .D1T1 * A_i_ptr--[4], A_X1X0 ;[ 1,1]
||[ B_o]LDW .D2T2 *+B_i_ptr[1], B_X7X6 ;[ 1,1]
|| MVK .S1 cst_c1, A_c3c1 ; c1
||[!B_o]B .S2 idct_8x8_abort ; num_idcts==0? Abort.
[ B_o]LDW .D1T1 *+A_i_ptr[5], A_X3X2 ;[ 2,1]
||[ B_o]LDW .D2T2 * B_i_ptr--[4], B_X5X4 ;[ 2,1]
|| MVK .S1 cst_c5, A_c7c5 ; c5
|| MVK .S2 cst_c2, B_c6c2 ; c2
;-
; B0 still holds the IRP value read by the MVC in the setup code, and A15 is
; still the twinned stack pointer here (the kernel later reuses A15).
STW .D1T2 B0, *A15[14] ; save IRP
; MVK loads the low halfword, MVKLH then fills the high halfword, giving the
; packed constants c7c5, c6c2 and c3c1.
MVKLH .S1 cst_c7, A_c7c5 ; c7
|| MVKLH .S2 cst_c6, B_c6c2 ; c6
MVKLH .S1 cst_c3, A_c3c1 ; c3
|| MVK .S2 cst_c5, B_c7c5 ; c5
MPYH .M1 A_X1X0, A_c7c5, A_X1c7 ;[ 6,1]
|| MPYLH .M2 B_X7X6, B_c6c2, B_X6c6 ;[ 6,1]
|| MVKLH .S2 cst_c7, B_c7c5 ; c7
; ===== Branch Occurs =====
;-
; EXT with kq_a/kq_b yields the low halfword (X0, resp. X4) scaled by
; 2^q_pt, standing in for a multiply by cst_c4.
EXT .S1 A_X1X0, kq_a, kq_b, A_P0 ;[ 7,1]
|| MPY .M1X A_X3X2, B_c6c2, A_X2c2 ;[ 7,1]
|| MPYHL .M2 B_X7X6, B_c7c5, B_X7c5 ;[ 7,1]
|| MV .L2X A_c3c1, B_c3c1 ; Copy c3c1 to the B side
; 256 == 2^(trunc1 - 1): rounding term for the later SHR by trunc1.
ADDK .S1 256, A_P0 ;[ 8,1]
|| EXT .S2 B_X5X4, kq_a, kq_b, B_P1 ;[ 8,1]
|| MPYHL .M1 A_X1X0, A_c3c1, A_X1c1 ;[ 8,1]
|| MPYH .M2 B_X7X6, B_c7c5, B_X7c7 ;[ 8,1]
;-
; ============================ PIPE LOOP KERNEL ==============================
; Horizontal-pass kernel: ten execute packets (h_loop_0 .. h_loop_9), each
; mixing work from several in-flight rows (see the ;[c,i] tags).
;
; Loop control, all visible in the packets below:
;  * B_o   - trip counter; tested and decremented in h_loop_4.
;  * A1    - prolog-collapse flag. It enters as -1 (non-zero), so the
;            [!A1] instructions (two post-decrementing stores and the AND
;            on A_i) are squashed on the first pass through the kernel.
;            h_loop_6 increments it once to 0; after that they execute.
;  * A_i   - row-within-block counter, incremented in h_loop_0 and wrapped
;            with AND 7 in h_loop_4. When it is 0, h_loop_7 / h_loop_9
;            move both output pointers back by 28 words; together with
;            eight one-halfword post-decrements that is 128 bytes, i.e.
;            one 8x8 halfword block.
; Each result is stored to a different row offset of the output block
; (+/- multiples of 8 halfwords from the output pointers).
h_loop:
h_loop_0:
SUB .L2 B_g1, B_h3n, B_x1 ;[19,1]
|| STH .D2T2 B_x0t, *-B_o_ptr[24] ;[19,1]
|| ADD .D1 A_i, 1, A_i ;[19,1]
|| SHR .S1 A_x3, trunc1, A_x3t ;[19,1]
|| ADD .L1X A_g3a, B_g3b, A_g3 ;[19,1]
|| ADD .S2X A_X2c2, B_X6c6, B_r0 ;[ 9,2]
|| MPYH .M1 A_X3X2, A_c3c1, A_X3c3 ;[ 9,2]
|| MPYHL .M2 B_X5X4, B_c7c5, B_X5c5 ;[ 9,2]
h_loop_1:
; The [!A1] store also carries B_o_ptr's post-decrement, so the pointer does
; not move on the squashed first pass.
ADD .L2 B_g1, B_h3n, B_x6 ;[20,1]
||[!A1] STH .D2T1 A_x3t, * B_o_ptr--[1] ;[20,1]
|| ADD .S1 A_h1, A_g3, A_x2 ;[20,1]
|| SUB .D1 A_h1, A_g3, A_x5 ;[20,1]
|| ADD .L1X A_P0, B_P1, A_p0 ;[10,2]
|| MPYHL .M1 A_X1X0, A_c7c5, A_X1c5 ;[10,2]
|| MPYHL .M2 B_X7X6, B_c3c1, B_X7c1 ;[10,2]
h_loop_2:
; Loads for the next row: both input pointers step back 4 words (one row of
; eight halfwords) per iteration.
SHR .S1 A_x5, trunc1, A_x5t ;[21,1]
|| SHR .S2 B_x1, trunc1, B_x1t ;[21,1]
|| ADD .L1 A_X1c1, A_X3c3, A_h2a ;[11,2]
|| ADD .L2 B_X5c5, B_X7c7, B_h2b ;[11,2]
|| MPYH .M1 A_X1X0, A_c3c1, A_X1c3 ;[11,2]
|| MPYH .M2 B_X5X4, B_c7c5, B_X5c7 ;[11,2]
|| LDW .D1T1 * A_i_ptr--[4], A_X1X0 ;[ 1,3]
|| LDW .D2T2 *+B_i_ptr[1], B_X7X6 ;[ 1,3]
h_loop_3:
SHR .S2 B_x6, trunc1, B_x6t ;[22,1]
|| SHR .S1 A_x2, trunc1, A_x2t ;[22,1]
|| SUB .L1X A_p0, B_r0, A_h0 ;[12,2]
|| ADD .L2X A_h2a, B_h2b, B_h2 ;[12,2]
|| MPYH .M1 A_X3X2, A_c7c5, A_X3c7 ;[12,2]
|| MPYH .M2 B_X5X4, B_c3c1, B_X5c3 ;[12,2]
|| LDW .D1T1 *+A_i_ptr[5], A_X3X2 ;[ 2,3]
|| LDW .D2T2 * B_i_ptr--[4], B_X5X4 ;[ 2,3]
h_loop_4:
; Loop branch: the remaining packets (h_loop_5 .. h_loop_9) are its delay
; slots. The branch and the decrement both read the pre-decrement B_o.
[ B_o]B .S2 h_loop ;[23,1]
|| STH .D1T1 A_x5t, *+A_o_ptr[8] ;[23,1]
|| SHR .S1 A_x4, trunc1, A_x4t ;[23,1]
|| ADD .L2X A_p0, B_r0, B_g0 ;[13,2]
||[ B_o]SUB .D2 B_o, 1, B_o ;[13,2]
||[!A1] AND .L1 A_i, 7, A_i ;[13,2]
|| MPYHL .M1 A_X3X2, A_c7c5, A_X3c5 ;[13,2]
|| MPYHL .M2 B_X5X4, B_c3c1, B_X5c1 ;[13,2]
h_loop_5:
[!A1] STH .D1T1 A_x4t, * A_o_ptr--[1] ;[24,1]
|| SUB .S1 A_X1c3, A_X3c7, A_h3a ;[14,2]
|| SUB .L1X A_P0, B_P1, A_p1 ;[14,2]
|| ADD .S2 B_g0, B_h2, B_x0 ;[14,2]
|| SUB .L2 B_X5c3, B_X7c1, B_g2b ;[14,2]
|| MPYHL .M1 A_X3X2, A_c3c1, A_X3c1 ;[14,2]
|| MPY .M2 B_X7X6, B_c6c2, B_X6c2 ;[14,2]
h_loop_6:
; The predicated ADD below is the one-shot step of A1 from -1 to 0.
STH .D1T2 B_x6t, *+A_o_ptr[17] ;[25,1]
|| SUB .D2 B_g0, B_h2, B_x7 ;[15,2]
|| SHR .S2 B_x0, trunc1, B_x0t ;[15,2]
|| SUB .S1 A_X1c7, A_X3c5, A_g2a ;[15,2]
|| ADD .L2 B_X5c1, B_X7c5, B_h3b ;[15,2]
|| MPYLH .M1X A_X3X2, B_c6c2, A_X2c6 ;[15,2]
|| MPYH .M2 B_X7X6, B_c3c1, B_X7c3 ;[15,2]
||[ A1] ADD .L1 A1, 1, A1
h_loop_7:
; Block wrap for output pointer #1 (only when A_i == 0).
[!A_i]SUBAW .D1 A_o_ptr, 28, A_o_ptr ;[26,1]
|| STH .D2T2 B_x1t, *-B_o_ptr[15] ;[26,1]
|| SHR .S2 B_x7, trunc1, B_x7t ;[16,2]
|| SUB .L1 A_X1c5, A_X3c1, A_g3a ;[16,2]
|| SUB .L2X B_h3b, A_h3a, B_h3n ;[16,2]
|| ADD .S1X A_g2a, B_g2b, A_g2 ;[16,2]
|| MPYH .M1 A_X1X0, A_c7c5, A_X1c7 ;[ 6,3]
|| MPYLH .M2 B_X7X6, B_c6c2, B_X6c6 ;[ 6,3]
h_loop_8:
STH .D2T1 A_x2t, *-B_o_ptr[7] ;[27,1]
|| ADD .L1 A_h0, A_g2, A_x3 ;[17,2]
|| SUB .D1 A_h0, A_g2, A_x4 ;[17,2]
|| SUB .L2X A_X2c6, B_X6c2, B_r1 ;[17,2]
|| EXT .S1 A_X1X0, kq_a, kq_b, A_P0 ;[ 7,3]
|| EXT .S2 B_X5X4, kq_a, kq_b, B_P1 ;[ 7,3]
|| MPY .M1X A_X3X2, B_c6c2, A_X2c2 ;[ 7,3]
|| MPYHL .M2 B_X7X6, B_c7c5, B_X7c5 ;[ 7,3]
h_loop_9:
; Block wrap for output pointer #2, plus the +256 rounding term on A_P0
; (2^(trunc1 - 1)) for the next row.
[!A_i]SUBAW .D2 B_o_ptr, 28, B_o_ptr ;[28,1]
|| STH .D1T2 B_x7t, *+A_o_ptr[24] ;[18,2]
|| ADD .S2X A_p1, B_r1, B_g1 ;[18,2]
|| SUB .L1X A_p1, B_r1, A_h1 ;[18,2]
|| ADD .L2 B_X5c7, B_X7c3, B_g3b ;[18,2]
|| ADDK .S1 256, A_P0 ;[ 8,3]
|| MPYHL .M1 A_X1X0, A_c3c1, A_X1c1 ;[ 8,3]
|| MPYH .M2 B_X7X6, B_c7c5, B_X7c7 ;[ 8,3]
; ============================ PIPE LOOP EPILOG ==============================
; Horizontal-pass epilog: finishes the adds, shifts and stores of the last
; in-flight row (the instructions tagged [19,3] .. [26,3]). Unlike the
; kernel, the stores here use fixed offsets and do not move the output
; pointers. The rest of the drain continues in the interloop code that
; follows the comment block below.
h_epilog:
SUB .L2 B_g1, B_h3n, B_x1 ;[19,3]
|| STH .D2T2 B_x0t, *-B_o_ptr[24] ;[19,3]
|| SHR .S1 A_x3, trunc1, A_x3t ;[19,3]
|| ADD .L1X A_g3a, B_g3b, A_g3 ;[19,3]
ADD .L2 B_g1, B_h3n, B_x6 ;[20,3]
|| STH .D2T1 A_x3t, *+B_o_ptr[0] ;[20,3]
|| ADD .S1 A_h1, A_g3, A_x2 ;[20,3]
|| SUB .D1 A_h1, A_g3, A_x5 ;[20,3]
;-
SHR .S1 A_x5, trunc1, A_x5t ;[21,3]
|| SHR .S2 B_x1, trunc1, B_x1t ;[21,3]
; The x1 store is tagged [26,3] but issued here, one packet after the SHR
; that produces B_x1t.
SHR .S2 B_x6, trunc1, B_x6t ;[22,3]
|| SHR .S1 A_x2, trunc1, A_x2t ;[22,3]
|| STH .D2T2 B_x1t, *-B_o_ptr[16] ;[26,3]
STH .D1T1 A_x5t, *+A_o_ptr[8] ;[23,3]
|| SHR .S1 A_x4, trunc1, A_x4t ;[23,3]
* ========================================================================= *
* Interloop code: Performs remaining epilog from horizontal pass, and *
* begins setup of the vertical pass. *
* *
* In order to save some time between loops, I start performing pointer *
* fixups and constant initializations in the epilog of the horizontal *
* pass loop. The horizontal pass works from the bottom of the *
* IDCT list and ends at the top, whereas the vertical pass works from *
* the top of the list and ends up at the bottom. As a result, the *
* displacement between the required pointer settings between the two *
* loops is fixed, regardless of the number of IDCTs processed, since *
* the two loops pointers always meet at the top of the list. *
* *
* The vertical loop needs a new repacking of the cosine terms: c6c3 and *
* c2c1. By playing around w/ how the cosine terms are packed, *
* I was able to save two whole registers in the vertical loop and thus *
* fit into the register file. I do this repacking partly here, and *
* partly in the vertical loop's prolog. *
* ========================================================================= *
; Last three stores of the horizontal pass, overlapped with the first pieces
; of vertical-pass setup.
STH .D1T1 A_x4t, *+A_o_ptr[0] ;[24,3]
;-
; Both input pointers are advanced by fixed byte offsets (168 and 156) to
; where the vertical pass starts reading.
STH .D1T2 B_x6t, *+A_o_ptr[16] ;[25,3]
|| ADDK .S1 168, A_i_ptr ; Fixup for vert loop
|| ADDK .S2 156, B_i_ptr ; Fixup for vert loop
.asg A15, A_c6c3 ; Symbolic name from
; vertical loop.
; SHR by 16 moves c3 from the upper half of A_c3c1 into the lower half of
; A_c6c3; the upper half (c6) is filled in by an MVKLH in v_prolog.
; IRP holds the stack pointer, so B0 becomes the frame base used by
; v_prolog to reload the spilled loop count.
STH .D2T1 A_x2t, *-B_o_ptr[8] ;[27,3]
|| SHR .S1 A_c3c1, 16, A_c6c3 ; Set up new cos cst
|| MVC .S2 IRP, B0 ; Get SP so we can
; unspill A_o.
; ============================================================================
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR VERT LOOP ================
; These .asg lines rebind the symbolic names for the vertical pass. The
; pointer symbols and A_c7c5 / B_c7c5 keep the registers they had in the
; horizontal pass; B_c2c1 takes over B12 (formerly B_c3c1) and A_c6c3 takes
; over A15. Note the roles of B2 and A2 swap: B2 is now the inner counter
; (B_i) and A2 the outer counter (A_o). As before, several symbols alias one
; physical register.
; NOTE(review): aliasing presumably depends on the schedule keeping live
; ranges disjoint -- re-verify before moving any instruction.
.asg A14, A_i_ptr ; Input pointer #1
.asg B15, B_i_ptr ; Input pointer #2
.asg A11, A_o_ptr ; Output pointer #1
.asg B11, B_o_ptr ; Output pointer #2
.asg B13, B_c7c5 ; Cosine terms c7, c5 (packed)
.asg A13, A_c7c5 ; Cosine terms c7, c5 (packed)
.asg A15, A_c6c3 ; Cosine terms c6, c3 (packed)
.asg B12, B_c2c1 ; Cosine terms c2, c1 (packed)
.asg A4, A_c1c4 ; Cosine term c1, c4 (alternates)
.asg A2, A_o ; Outer loop counter
.asg B2, B_i ; Inner loop counter
.asg A12, A_X7X6 ; Incoming coefs X7, X6 (packed)
.asg A8, A_X5X4 ; Incoming coefs X5, X4 (packed)
.asg B10, B_X3X2 ; Incoming coefs X3, X2 (packed)
.asg B14, B_X1X0 ; Incoming coefs X1, X0 (packed)
.asg B9, B_rnd ; Rounding value applied to P0
.asg B1, B_P0_t ; Node P0, temporary pre-rounding
.asg B5, B_P0 ; Rounded value of Node P0
.asg A7, A_P1 ; Node P1 in signal flow graph
.asg B0, B_X2c2 ; X2 * c2
.asg B4, B_X2c6 ; X2 * c6
.asg A4, A_X6c2 ; X6 * c2
.asg A3, A_X6c6 ; X6 * c6
.asg A5, A_p0 ; Node p0 in signal flow graph
.asg A8, A_p1 ; Node p1 in signal flow graph
.asg B4, B_r1 ; Node r1 in signal flow graph
.asg B3, B_r0 ; Node r0 in signal flow graph
.asg B0, B_g0 ; Node g0 in signal flow graph
.asg A1, A_g1 ; Node g1 in signal flow graph
.asg B3, B_h1 ; Node h1 in signal flow graph
.asg A3, A_h0 ; Node h0 in signal flow graph
.asg B5, B_X1c1 ; X1 * c1
.asg B1, B_X1c3 ; X1 * c3
.asg B3, B_X1c5 ; X1 * c5
.asg B8, B_X1c7 ; X1 * c7
.asg B0, B_X3c1 ; X3 * c1
.asg B0, B_X3c3 ; X3 * c3
.asg B0, B_X3c5 ; X3 * c5
.asg B9, B_X3c7 ; X3 * c7
.asg A3, A_X5c1 ; X5 * c1
.asg A1, A_X5c3 ; X5 * c3
.asg A5, A_X5c5 ; X5 * c5
.asg A0, A_X5c7 ; X5 * c7
.asg A6, A_X7c1 ; X7 * c1
.asg A7, A_X7c3 ; X7 * c3
.asg A4, A_X7c5 ; X7 * c5
.asg A6, A_X7c7 ; X7 * c7
.asg A3, A_h2a ; X5 * c5 + X7 * c7
.asg B3, B_h2b ; X1 * c1 + X3 * c3
.asg B6, B_h2 ; Node h2 in signal flow graph
.asg A4, A_h3a ; X5 * c1 + X7 * c5
.asg B1, B_h3b ; X1 * c3 - X3 * c7
.asg A3, A_h3 ; Node h3 in signal flow graph
.asg A9, A_g3a ; X5 * c7 + X7 * c3
.asg B1, B_g3b ; X1 * c5 - X3 * c1
.asg B7, B_g3 ; Node g3 in signal flow graph
.asg A9, A_g2a ; X5 * c3 - X7 * c1
.asg B1, B_g2b ; X1 * c7 - X3 * c5
.asg A0, A_g2 ; Node g2 in signal flow graph
.asg B8, B_x0 ; Output x0, pre-saturate/truncate
.asg A1, A_x1 ; Output x1, pre-saturate/truncate
.asg B7, B_x2 ; Output x2, pre-saturate/truncate
.asg A4, A_x3 ; Output x3, pre-saturate/truncate
.asg A0, A_x4 ; Output x4, pre-saturate/truncate
.asg B4, B_x5 ; Output x5, pre-saturate/truncate
.asg A5, A_x6 ; Output x6, pre-saturate/truncate
.asg B6, B_x7 ; Output x7, pre-saturate/truncate
.asg B5, B_x0s ; Output x0, saturated to 9 bits
.asg A10, A_x1s ; Output x1, saturated to 9 bits
.asg B3, B_x2s ; Output x2, saturated to 9 bits
.asg A6, A_x3s ; Output x3, saturated to 9 bits
.asg A7, A_x4s ; Output x4, saturated to 9 bits
.asg B4, B_x5s ; Output x5, saturated to 9 bits
.asg A3, A_x6s ; Output x6, saturated to 9 bits
.asg B6, B_x7s ; Output x7, saturated to 9 bits
.asg B8, B_x0t ; Output x0, truncated to 9 bits
.asg A0, A_x1t ; Output x1, truncated to 9 bits
.asg B0, B_x2t ; Output x2, truncated to 9 bits
.asg A6, A_x3t ; Output x3, truncated to 9 bits
.asg A7, A_x4t ; Output x4, truncated to 9 bits
.asg B4, B_x5t ; Output x5, truncated to 9 bits
.asg A5, A_x6t ; Output x6, truncated to 9 bits
.asg B3, B_x7t ; Output x7, truncated to 9 bits
; ============================================================================
; ============================ PIPE LOOP PROLOG ==============================
; Vertical-pass prolog: reloads the spilled trip count, finishes the pointer
; fixups and the c6c3 / c2c1 constant repacking, and starts the first
; iteration's loads and multiplies.
;
; The five branches at the end each target an address a few instructions
; INTO one of the first five kernel execute packets (v_loop_0 .. v_loop_4).
; Instructions are 4 bytes, so "+ 8" enters after the first two instructions
; of that packet and "+ 12" after the first three; the skipped instructions
; are the stores and shifts tagged for iteration 1 at cycles 28..32, which
; have no valid data yet. The branches are issued in consecutive packets, so
; they take effect one after another and the kernel is entered packet by
; packet; execution falls through normally from v_loop_5 on.
v_prolog:
; B0 was loaded from IRP (the stack pointer) by the interloop code; frame
; word [1] holds the trip count spilled during function setup.
LDW .D2T1 *B0[1], A_o ; Unspill loop counter
|| ADDK .S2 -128, B_o_ptr ; Fixup for vert loop
;-
LDW .D1T1 *+A_i_ptr[1], A_X7X6 ;[ 1,1]
|| LDW .D2T2 *-B_i_ptr[1], B_X1X0 ;[ 1,1]
ADDK .S1 -128, A_o_ptr ; Fixup for vert loop
; Set up modified constants for second loop
; Note: A_c7c5, B_c7c5 are in same regs both loops.
; Also, B_c2c1 reuses h_loop's B_c3c1.
LDW .D2T2 * B_i_ptr++[4], B_X3X2 ;[ 3,1]
|| LDW .D1T1 * A_i_ptr++[4], A_X5X4 ;[ 3,1]
; MVKLH replaces only the upper halfword: c3c1 becomes c2c1, and A_c6c3
; (whose lower half already holds c3) gets c6 on top.
MVKLH .S2 cst_c2, B_c2c1 ; c2 (B_c2c1 == B_c3c1)
|| MVKLH .S1 cst_c6, A_c6c3 ; c6
MVK .S2 8, B_i ; Inner loop counter.
;-
MPYHL .M1 A_X7X6, A_c6c3, A_X7c3 ;[ 6,1]
MPYH .M1 A_X7X6, A_c7c5, A_X7c7 ;[ 7,1]
|| MPYHL .M2 B_X1X0, B_c2c1, B_X1c1 ;[ 7,1]
MVK .S1 cst_c4, A_c1c4 ;[ 8,1]
|| MPYH .M1 A_X5X4, A_c7c5, A_X5c7 ;[ 8,1]
|| MPYHL .M2 B_X1X0, B_c7c5, B_X1c5 ;[ 8,1]
MPY .M1 A_X5X4, A_c1c4, A_P1 ;[ 9,1]
|| MPYHL .M2 B_X3X2, B_c2c1, B_X3c1 ;[ 9,1]
;-
ADD .D1 A_X5c7, A_X7c3, A_g3a ;[10,1]
|| MPYHL .M1 A_X5X4, A_c6c3, A_X5c3 ;[10,1]
|| MPYHL .M2X B_X3X2, A_c6c3, B_X3c3 ;[10,1]
; B_rnd = -32768; it is later SUBtracted from P0, which adds 2^15.
SUB .L2 B_X1c5, B_X3c1, B_g3b ;[11,1]
|| MPYHL .M1 A_X5X4, A_c7c5, A_X5c5 ;[11,1]
|| MPY .M2X B_X1X0, A_c1c4, B_P0_t ;[11,1]
|| MVK .S2 -32768, B_rnd ;[ 6,1]
|| B .S1 v_loop_0 + 8 ; skip 2
;-
ADD .L2X B_g3b, A_g3a, B_g3 ;[12,1]
|| MPYHL .M1X A_X7X6, B_c2c1, A_X7c1 ;[12,1]
|| MPYH .M2 B_X3X2, B_c7c5, B_X3c7 ;[12,1]
|| LDW .D1T1 *+A_i_ptr[1], A_X7X6 ;[ 1,2]
|| LDW .D2T2 *-B_i_ptr[1], B_X1X0 ;[ 1,2]
|| B .S2 v_loop_1 + 8 ; skip 2
;-
; A_c1c4 is switched from c4 to c1 here so that the next packet can take
; X5 * c1 from the A side; the kernel instead reads c1 from B_c2c1.
SUB .D2 B_P0_t, B_rnd, B_P0 ;[13,1]
|| ADD .L2 B_X1c1, B_X3c3, B_h2b ;[13,1]
|| ADD .L1 A_X5c5, A_X7c7, A_h2a ;[13,1]
|| MPYLH .M1X A_X7X6, B_c2c1, A_X6c2 ;[13,1]
|| MPYLH .M2X B_X3X2, A_c6c3, B_X2c6 ;[13,1]
|| B .S2 v_loop_2 + 12 ; skip 3
|| MVKL .S1 cst_c1, A_c1c4 ;
;-
SUB .L1 A_X5c3, A_X7c1, A_g2a ;[14,1]
|| MPYHL .M1 A_X5X4, A_c1c4, A_X5c1 ;[14,1]
|| MPYHL .M2X B_X1X0, A_c6c3, B_X1c3 ;[14,1]
|| LDW .D2T2 * B_i_ptr++[4], B_X3X2 ;[ 3,2]
|| LDW .D1T1 * A_i_ptr++[4], A_X5X4 ;[ 3,2]
|| B .S2 v_loop_3 + 4 ; skip 1
|| ADD .S1X B_P0, A_P1, A_p0 ;[16,1]
;-
ADD .L2X B_h2b, A_h2a, B_h2 ;[15,1]
|| SUB .L1X B_P0, A_P1, A_p1 ;[15,1]
|| MPYHL .M1 A_X7X6, A_c7c5, A_X7c5 ;[15,1]
|| MPYLH .M2 B_X3X2, B_c2c1, B_X2c2 ;[15,1]
|| B .S2 v_loop_4 + 4 ; skip 1
SUB .L2X B_X2c6, A_X6c2, B_r1 ;[16,1]
|| MPYLH .M1 A_X7X6, A_c6c3, A_X6c6 ;[16,1]
|| MPYH .M2 B_X1X0, B_c7c5, B_X1c7 ;[16,1]
;-
; ===== Branch Occurs =====
; ============================ PIPE LOOP KERNEL ==============================
; Vertical-pass kernel: eleven execute packets (v_loop_0 .. v_loop_a).
; The first instructions of v_loop_0 .. v_loop_4 must stay first in their
; packets: v_prolog branches to fixed byte offsets inside those packets to
; skip them on the first pass.
;
; Loop control, all visible in the packets below:
;  * A_o  - trip counter; tested and decremented in v_loop_5.
;  * B_i  - position-within-block counter: wrapped with AND 7 in v_loop_8,
;           decremented in v_loop_a while A_o is non-zero. When it is 0,
;           v_loop_2 / v_loop_5 advance both output pointers by 28 words;
;           with eight one-halfword post-increments that is 128 bytes,
;           i.e. one 8x8 halfword block.
; Output conditioning: each result is saturated with SSHL by satl and then
; shifted right by trunc2 before being stored as a halfword.
v_loop:
v_loop_0:
; B_rnd is re-materialised every iteration because B9 is shared with other
; symbols (B_X3c7); subtracting it in v_loop_7 adds 2^15 to P0.
STH .D1T2 B_x7t, *+A_o_ptr[24] ;[28,1]
|| SHR .S1 A_x4s, trunc2, A_x4t ;[28,1]
|| ADD .L1 A_X5c1, A_X7c5, A_h3a ;[17,2]
|| SUB .D2 B_X1c3, B_X3c7, B_h3b ;[17,2]
|| SUB .L2X A_p1, B_r1, B_h1 ;[17,2]
|| MPYHL .M2 B_X3X2, B_c7c5, B_X3c5 ;[17,2]
|| MVK .S2 -32768, B_rnd ;[ 6,3]
|| MPYHL .M1 A_X7X6, A_c6c3, A_X7c3 ;[ 6,3]
v_loop_1:
STH .D1T1 A_x4t, * A_o_ptr++[1] ;[29,1]
|| SHR .S1 A_x1s, trunc2, A_x1t ;[29,1]
|| ADD .S2 B_h1, B_g3, B_x2 ;[18,2]
|| SUB .D2 B_h1, B_g3, B_x5 ;[18,2]
|| ADD .L1X A_p1, B_r1, A_g1 ;[18,2]
|| ADD .L2X B_X2c2, A_X6c6, B_r0 ;[18,2]
|| MPYH .M1 A_X7X6, A_c7c5, A_X7c7 ;[ 7,3]
|| MPYHL .M2 B_X1X0, B_c2c1, B_X1c1 ;[ 7,3]
v_loop_2:
; A_c1c4 (A4) is reloaded with c4 each iteration; A4 is also used for other
; symbols (A_X6c2, A_X7c5, A_h3a, A_x3) elsewhere in the kernel.
[!B_i]ADDAW .D1 A_o_ptr, 28, A_o_ptr ;[30,1]
|| STH .D2T1 A_x3t, * B_o_ptr++[1] ;[30,1]
|| SHR .S2 B_x0s, trunc2, B_x0t ;[30,1]
|| SUB .L2 B_X1c7, B_X3c5, B_g2b ;[19,2]
|| SUB .L1X B_h3b, A_h3a, A_h3 ;[19,2]
|| MVK .S1 cst_c4, A_c1c4 ;[ 8,3]
|| MPYH .M1 A_X5X4, A_c7c5, A_X5c7 ;[ 8,3]
|| MPYHL .M2 B_X1X0, B_c7c5, B_X1c5 ;[ 8,3]
v_loop_3:
STH .D2T1 A_x1t, *-B_o_ptr[17] ;[31,1]
|| ADD .L2X A_p0, B_r0, B_g0 ;[20,2]
|| SSHL .S2 B_x5, satl, B_x5s ;[20,2]
|| SUB .S1X A_p0, B_r0, A_h0 ;[20,2]
|| SUB .L1 A_g1, A_h3, A_x6 ;[20,2]
|| ADD .D1 A_g1, A_h3, A_x1 ;[20,2]
|| MPY .M1 A_X5X4, A_c1c4, A_P1 ;[ 9,3]
|| MPYHL .M2 B_X3X2, B_c2c1, B_X3c1 ;[ 9,3]
v_loop_4:
STH .D2T2 B_x0t, *-B_o_ptr[25] ;[32,1]
|| SUB .S2 B_g0, B_h2, B_x7 ;[21,2]
|| ADD .L2 B_g0, B_h2, B_x0 ;[21,2]
|| ADD .L1X B_g2b, A_g2a, A_g2 ;[21,2]
|| SSHL .S1 A_x1, satl, A_x1s ;[21,2]
|| ADD .D1 A_X5c7, A_X7c3, A_g3a ;[10,3]
|| MPYHL .M1 A_X5X4, A_c6c3, A_X5c3 ;[10,3]
|| MPYHL .M2X B_X3X2, A_c6c3, B_X3c3 ;[10,3]
v_loop_5:
; Loop branch: v_loop_6 .. v_loop_a are its delay slots. The branch and the
; decrement both read the pre-decrement A_o.
[ A_o]B .S1 v_loop ;[33,1]
||[!B_i]ADDAW .D2 B_o_ptr, 28, B_o_ptr ;[33,1]
|| SSHL .S2 B_x2, satl, B_x2s ;[22,2]
|| ADD .D1 A_h0, A_g2, A_x3 ;[22,2]
||[ A_o]SUB .L1 A_o, 1, A_o ;[22,2]
|| SUB .L2 B_X1c5, B_X3c1, B_g3b ;[11,3]
|| MPYHL .M1 A_X5X4, A_c7c5, A_X5c5 ;[11,3]
|| MPY .M2X B_X1X0, A_c1c4, B_P0_t ;[11,3]
v_loop_6:
SHR .S2 B_x5s, trunc2, B_x5t ;[23,2]
|| SUB .L1 A_h0, A_g2, A_x4 ;[23,2]
|| SSHL .S1 A_x6, satl, A_x6s ;[23,2]
|| ADD .L2X B_g3b, A_g3a, B_g3 ;[12,3]
|| MPYHL .M1X A_X7X6, B_c2c1, A_X7c1 ;[12,3]
|| MPYH .M2 B_X3X2, B_c7c5, B_X3c7 ;[12,3]
|| LDW .D1T1 *+A_i_ptr[1], A_X7X6 ;[ 1,4]
|| LDW .D2T2 *-B_i_ptr[1], B_X1X0 ;[ 1,4]
v_loop_7:
; P0 rounding: B_P0 = B_P0_t - (-32768).
SHR .S2 B_x2s, trunc2, B_x2t ;[24,2]
|| STH .D1T2 B_x5t, *+A_o_ptr[8] ;[24,2]
|| SHR .S1 A_x6s, trunc2, A_x6t ;[24,2]
|| SUB .D2 B_P0_t, B_rnd, B_P0 ;[13,3]
|| ADD .L2 B_X1c1, B_X3c3, B_h2b ;[13,3]
|| ADD .L1 A_X5c5, A_X7c7, A_h2a ;[13,3]
|| MPYLH .M1X A_X7X6, B_c2c1, A_X6c2 ;[13,3]
|| MPYLH .M2X B_X3X2, A_c6c3, B_X2c6 ;[13,3]
v_loop_8:
; Both input pointers advance 4 words per iteration via the post-increment
; loads.
AND .L2 B_i, 7, B_i ;[36,1]
|| SSHL .S2 B_x7, satl, B_x7s ;[25,2]
|| SSHL .S1 A_x3, satl, A_x3s ;[25,2]
|| SUB .L1 A_X5c3, A_X7c1, A_g2a ;[14,3]
|| MPYHL .M1X A_X5X4, B_c2c1, A_X5c1 ;[14,3]
|| MPYHL .M2X B_X1X0, A_c6c3, B_X1c3 ;[14,3]
|| LDW .D2T2 * B_i_ptr++[4], B_X3X2 ;[ 3,4]
|| LDW .D1T1 * A_i_ptr++[4], A_X5X4 ;[ 3,4]
v_loop_9:
STH .D2T2 B_x2t, *-B_o_ptr[8] ;[26,2]
|| SHR .S1 A_x3s, trunc2, A_x3t ;[26,2]
|| SHR .S2 B_x7s, trunc2, B_x7t ;[26,2]
|| ADD .L2X B_h2b, A_h2a, B_h2 ;[15,3]
|| SUB .L1X B_P0, A_P1, A_p1 ;[15,3]
|| MPYHL .M1 A_X7X6, A_c7c5, A_X7c5 ;[15,3]
|| MPYLH .M2 B_X3X2, B_c2c1, B_X2c2 ;[15,3]
v_loop_a:
; B_i is only decremented while A_o is non-zero, i.e. it is frozen once the
; trip count has run out.
[ A_o]SUB .D2 B_i, 1, B_i ;[27,2]
|| STH .D1T1 A_x6t, *+A_o_ptr[16] ;[27,2]
|| SSHL .S2 B_x0, satl, B_x0s ;[27,2]
|| SSHL .S1 A_x4, satl, A_x4s ;[27,2]
|| SUB .L2X B_X2c6, A_X6c2, B_r1 ;[16,3]
|| ADD .L1X B_P0, A_P1, A_p0 ;[16,3]
|| MPYLH .M1 A_X7X6, A_c6c3, A_X6c6 ;[16,3]
|| MPYH .M2 B_X1X0, B_c7c5, B_X1c7 ;[16,3]
; ============================ PIPE LOOP EPILOG ==============================
; The epilog instructions themselves follow the comment block below, starting
; at the idct_8x8_abort label.
v_epilog:
* ========================================================================= *
* Post-vertical loop code: Performs remaining vertical-loop epilog, *
* pulls registers from the stack, restores the interrupt-enable state, *
* and returns to the caller. *
* *
* For speed, I start pulling items from the stack as quickly as *
* possible. I pop the return address earliest, followed by the CSR *
* restore value and the rest of the stack frame (basically, the SOE *
* registers). *
* *
* I throw the return branch in flight nearly as soon as the return addr *
* arrives from the stack in order to return to the caller as soon as *
* possible. I don't think it's possible to save any more time in this *
* epilog code. :-) *
* *
* Once the stack-frame restore is complete, I allow the remainder of *
* the epilog (mostly shifts and stores) to complete, in the remaining *
* delay slots of the return branch. Since the stack-restore loads *
* need to complete before this time anyway, I couldn't have used those *
* cycles for much else anyway. *
* *
* The interrupt-enable state is not restored until the return branch *
* is in flight. This implies that any pending interrupt will be taken *
* on arrival in the calling function, assuming it called the IDCT with *
* interrupts enabled. *
* *
* Again, this code uses twin stack-pointers for speed. *
* *
* To highlight how intertwined the epilog is with the stack frame code *
* I've added comments highlighting what is what. *
* *
* I've played a trick in order to allow an early abort from the code: *
* If the loop trip count is calculated to be zero by the main setup code *
* at the beginning, an emergency branch is made to the abort label *
* below. (The abort is triggered only if we're asked to do zero IDCTs.) *
* The outer loop trip count for the first loop (B_o) is stored in B2. *
* The second loop uses B2 for its inner loop trip count (B_i). Under *
* normal operation, B2 (aka. B_o) is non-zero upon entry to this code. *
* However, in the case of an abort, it will be zero, since we did not *
* execute either loop. Therefore we can use B_o to shut off the epilog *
* stores in the case of an early abort. *
* ========================================================================= *
; Combined vertical-pass epilog and function exit. Also the target of the
; zero-block abort branch in h_prolog; in that case B_o (B2) is zero and
; every [B_o]-predicated store below is squashed.
;
; Frame slots read here (word offsets from the stack pointer recovered from
; IRP): [2] return address, [3] CSR, [4] B10, [5] A10, [6] B11, [7] A11,
; [8] B12, [9] A12, [10] A13, [11] B13, [12] B14, [13] A14, [14] IRP,
; [16] A15.
idct_8x8_abort:
[ B_o]STH .D1T2 B_x7t, *+A_o_ptr[24] ; epilog code
|| SHR .S1 A_x4s, trunc2, A_x4t ; epilog code
|| MVC .S2 IRP, B15 ; Get stack pointer
; B_o_ptr (B11) is copied to B0 because B11 itself is reloaded from the
; frame below while the remaining epilog stores still need the pointer.
LDW .D2T2 *+ B15[ 2], B3 ; Get return address
|| MV .L2 B_o_ptr, B0 ; We need this later.
|| SHR .S1 A_x1s, trunc2, A_x1t ; epilog code
|| MV .L1X B15, A1 ; Twin the stack pointer
LDW .D2T1 *+ B15[14], A2 ; Caller's IRP value
;-
; A13's saved value is loaded on the B side into B1 and moved across to A13
; later; the saved CSR lands in A3.
LDW .D2T2 *+ B15[10], B1 ; A13 value's
|| LDW .D1T1 *+ A1 [13], A14 ; Restore A14
LDW .D2T2 *+ B15[12], B14 ; Restore B14
|| LDW .D1T1 *+ A1 [ 3], A3 ; CSR value's
LDW .D1T2 *+ A1 [ 4], B10 ; Restore B10
|| LDW .D2T1 *+ B15[ 5], A10 ; Restore A10
LDW .D1T2 *+ A1 [ 6], B11 ; Restore B11
|| LDW .D2T1 *+ B15[ 7], A11 ; Restore A11
;-
; The return branch is issued here; the five packets that follow are its
; delay slots and finish the restore plus the last epilog stores.
LDW .D1T2 *+ A1 [ 8], B12 ; Restore B12
|| LDW .D2T1 *+ B15[ 9], A12 ; Restore A12
|| RET .S2 B3 ; Go home!
; The pre-increment load pops the 16-word frame and reloads A15 from the
; slot written by the very first STW of the function.
LDW .D1T2 *+ A1 [11], B13 ; Restore B13
|| LDW .D2T1 *++B15[16], A15 ; Restore A15, B15
|| MV .L1X B1, A13 ; Restore A13
[ B_o]STH .D2T1 A_x3t, * B0 ; epilog code
|| SHR .S2 B_x0s, trunc2, B_x0t ; epilog code
;-
[ B_o]STH .D2T1 A_x1t, *-B0[16] ; epilog code
[ B_o]STH .D2T2 B_x0t, *-B0[24] ; epilog code
|| MVC .S2X A2, IRP ; Restore IRP
; CSR is written back in the last delay slot of the return branch, restoring
; the interrupt-enable state saved on entry.
[ B_o]STH .D2T1 A_x4t, *+B0[8] ; epilog code
|| MVC .S2X A3, CSR ; Restore CSR
;-
v_end:
* ========================================================================= *
* End of file: img_idct_8x8.asm *
* ------------------------------------------------------------------------- *