; www.pudn.com > ccs_encoder.rar > fdct.asm
; .sect ".data:copyright_h"
;
; Code section and exported entry point of the 8x8 forward DCT routine.
; As used by the code below: A4 holds the data pointer (A_i_ptr; the output
; pointer is derived from it, so results overwrite the input) and B4 holds
; num_fdcts, the number of 8x8 blocks to process.
.text
.global _IMG_fdct_8x8
_IMG_fdct_8x8:
; ========================== SYMBOLIC CONSTANTS =========================== ;
; 16-bit cosine coefficients. The vertical-loop prolog packs them in pairs
; into 32-bit registers with MVKL (low half) / MVKLH (high half):
; c1:c7, c2:c6 and c3:c5, while c0 is paired with the constant 1.
.asg 0xB505, cst_c0 ; Cosine term c0
.asg 0x2C62, cst_c1 ; Cosine term c1
.asg 0x29CF, cst_c2 ; Cosine term c2
.asg 0x25A0, cst_c3 ; Cosine term c3
.asg 0x1924, cst_c5 ; Cosine term c5
.asg 0x1151, cst_c6 ; Cosine term c6
.asg 0x08D4, cst_c7 ; Cosine term c7
; =============== SYMBOLIC REGISTER ASSIGNMENTS: VERT LOOP ================ ;
; Symbolic names for the physical registers used by the vertical pass.
; Many names deliberately share one register (for example A6 is A_f7, A_g0,
; A_p0 and A_c2r0).
; NOTE(review): this presumably relies on the values having non-overlapping
; lifetimes in the pipelined schedule -- re-check the kernel before
; rebinding any name.
; B15, normally the stack pointer, is also bound to data values here
; (B_f6, B_g3, B_s0a, B_s0, B_r1); the prolog parks the stack pointer in
; IRP before B15 is overwritten.
.asg A11, A_k1c0 ; 1, Cosine term c0 (packed)
.asg A12, A_c1c7 ; Cosine terms c1, c7 (packed)
.asg A13, A_c2c6 ; Cosine terms c2, c6 (packed)
.asg B11, B_k1c0 ; 1, Cosine term c0 (packed)
.asg B12, B_c1c7 ; Cosine terms c1, c7 (packed)
.asg B13, B_c2c6 ; Cosine terms c2, c6 (packed)
.asg B14, B_c3c5 ; Cosine terms c3, c5 (packed)
.asg A4, A_i_ptr ; Input pointer
.asg B10, B_o_ptr ; Output pointer
.asg A9, A_f0 ; Spatial domain sample f0
.asg B8, B_f1 ; Spatial domain sample f1
.asg B6, B_f2 ; Spatial domain sample f2
.asg A5, A_f3 ; Spatial domain sample f3
.asg A7, A_f4 ; Spatial domain sample f4
.asg B7, B_f5 ; Spatial domain sample f5
.asg B15, B_f6 ; Spatial domain sample f6
.asg A6, A_f7 ; Spatial domain sample f7
.asg A6, A_g0 ; Node g0 in flow graph
.asg B8, B_g1 ; Node g1 in flow graph
.asg B6, B_h1 ; Node h1 in flow graph
.asg A7, A_h0 ; Node h0 in flow graph
.asg A0, A_s1 ; Node s1 (h2) in flow graph
.asg B4, B_h3 ; Node h3 in flow graph
.asg B15, B_g3 ; Node g3 in flow graph
.asg A15, A_q1 ; Node q1 (g2) in flow graph
.asg A6, A_p0 ; Node p0 in flow graph
.asg B6, B_p1 ; Node p1 in flow graph
.asg B15, B_s0a ; Node s0 intermediate result
.asg B5, B_s0b ; Node s0 intermediate result
.asg B15, B_s0 ; Node s0 in flow graph
.asg A3, A_r0 ; Node r0 in flow graph
.asg B15, B_r1 ; Node r1 in flow graph
.asg B4, B_q0a ; Node q0 intermediate result
.asg A14, A_q0b ; Node q0 intermediate result
.asg A3, A_q0 ; Node q0 in flow graph
.asg A10, A_Q1 ; Node Q1 in flow graph
.asg B5, B_S1 ; Node S1 in flow graph
.asg A3, A_Q0 ; Node Q0 in flow graph
.asg B4, B_S0 ; Node S0 in flow graph
.asg A14, A_c1Q1 ; Intermediate value c1 * Q1
.asg A6, A_c2r0 ; Intermediate value c2 * r0
.asg A7, A_c3Q0 ; Intermediate value c3 * Q0
.asg A3, A_c5Q0 ; Intermediate value c5 * Q0
.asg A14, A_c6r0 ; Intermediate value c6 * r0
.asg A8, A_c7Q1 ; Intermediate value c7 * Q1
.asg B5, B_c1S1 ; Intermediate value c1 * S1
.asg B0, B_c2r1 ; Intermediate value c2 * r1
.asg B0, B_c3S0 ; Intermediate value c3 * S0
.asg B3, B_c5S0 ; Intermediate value c5 * S0
.asg B6, B_c6r1 ; Intermediate value c6 * r1
.asg B5, B_c7S1 ; Intermediate value c7 * S1
.asg B9, B_F0 ; Frequency domain term F0
.asg A8, A_F1 ; Frequency domain term F1
.asg A5, A_F2 ; Frequency domain term F2
.asg B4, B_F3 ; Frequency domain term F3
.asg B3, B_F4 ; Frequency domain term F4
.asg A9, A_F5 ; Frequency domain term F5
.asg A10, A_F6 ; Frequency domain term F6
.asg B4, B_F7 ; Frequency domain term F7
.asg A8, A_F1t ; Truncated result for F1
.asg A5, A_F2t ; Truncated result for F2
.asg B7, B_F3t ; Truncated result for F3
.asg A10, A_F5t ; Truncated result for F5
.asg A10, A_F6t ; Truncated result for F6
.asg B5, B_F7t ; Truncated result for F7
.asg B2, B_i ; Inner loop counter #1
.asg A1, A_i ; Inner loop counter #2
.asg B1, B_o ; Outer loop counter
.asg A2, A_c ; Prolog collapse counter
; ========================================================================= ;
* ========================================================================= *
* Initialization code / Stack Management *
* *
* This code is responsible for saving registers to the stack, disabling *
* interrupts, and setting up for the vertical loop. *
* *
* This function requires 16 words of stack. A10...A15, B10...B14, B3 *
* (the return address), CSR, IRP, and the loop trip count derived from *
* 'num_fdcts' are all pushed on the stack. For speed, this code uses *
* twin stack pointers to offload registers onto the stack as quickly *
* as possible. *
* *
* The majority of the code in this function is not interruptible. *
* Therefore, interrupts are disabled almost immediately after entry *
* into the function, and the previous interruptibility state is restored *
* on exit. The previous value of CSR is pushed on the stack and *
* restored on exit. *
* *
* Since all 32 registers are used by the vertical loop, the stack *
* pointer is saved in the IRP register. The previous contents of IRP *
* are also pushed on the stack. *
* *
* Initialization for constants (cosine terms, etc.) is overlapped with *
* the prolog of the vertical loop to save time. Pointer setup for the *
* output pointer is also hidden in the prolog. *
* *
* Early exit code suppresses most of the function's activity (including *
* most of the stack accesses) if num_fdcts (in B4) is zero. It is not *
* possible to exit the function faster. *
* ========================================================================= *
;-
; Function entry: allocate a 16-word frame, compute the loop trip count and
; either return at once (num_fdcts == 0) or save registers and mask
; interrupts.
;
; A15 receives the pre-allocation stack pointer (old SP == new SP + 16
; words), so a store to *-A15[n] lands in word SP[16-n] of the new frame.
; Frame layout, in words relative to the new B15:
;   SP[16] A15    SP[15] A14 (*)   SP[14] B14    SP[13] A13    SP[12] A11
;   SP[11] A12    SP[10] B12       SP[ 9] B13    SP[ 8] B11    SP[ 7] A10
;   SP[ 6] B10    SP[ 5] B3        SP[ 4] IRP    SP[ 3] loop count (*)
;   SP[ 2] CSR (*)
; Entries marked (*) are stored later, from inside the vertical-loop prolog.
;
; Every instruction after the first packet is predicated on B_o, so with
; num_fdcts == 0 only the RET and the A15 reload (which also releases the
; frame) take effect.
STW .D2T1 A15, * B15--[16] ; Save A15, get stk frame
|| MV .L1X B15, A15 ; Twin Stack Pointer
|| SHL .S2 B4, 3, B_o ; iters == num_fdcts * 8
[ B_o]STW .D1T2 B14, *-A15 [ 2] ; Save B14 (SP[14])
||[ B_o]ADD .L2 B_o, -1, B_o ; Adj. for parallel iters
||[ B_o]ADDK .S1 48, A_i_ptr ; Point to row 3, col 0
||[!B_o]RET .S2 B3 ; Abort if num_fdcts == 0
||[!B_o]LDW .D2T1 *++B15[16], A15 ; Restore A15 on abort
; ===== Interrupts masked by branch delay slots =====
;-
[ B_o]STW .D1T1 A13, *-A15 [ 3] ; Save A13 (SP[13])
||[ B_o]STW .D2T2 B11, *+B15 [ 8] ; Save B11 (SP[ 8])
||[ B_o]MVC .S2 CSR, B0 ; Snapshot CSR
[ B_o]STW .D1T1 A12, *-A15 [ 5] ; Save A12 (SP[11])
||[ B_o]STW .D2T2 B12, *+B15 [10] ; Save B12 (SP[10])
[ B_o]STW .D1T2 B13, *-A15 [ 7] ; Save B13 (SP[ 9])
||[ B_o]STW .D2T1 A11, *+B15 [12] ; Save A11 (SP[12])
||[ B_o]MVC .S2 IRP, B5 ; Snapshot IRP
;-
[ B_o]STW .D1T1 A10, *-A15 [ 9] ; Save A10 (SP[ 7])
||[ B_o]STW .D2T2 B10, *+B15 [ 6] ; Save B10 (SP[ 6])
||[ B_o]AND .L2 B0, -2, B2 ; Clear GIE bit in CSR
||[ B_o]MV .L1X B5, A1 ; Partitioning MV (IRP copy to A side)
[ B_o]STW .D2T2 B3, *+B15 [ 5] ; Save return address (SP[ 5])
||[ B_o]STW .D1T1 A1, *-A15 [12] ; Save IRP (SP[ 4])
||[ B_o]MV .L1X B0, A0 ; Partitioning MV (CSR copy to A side)
||[ B_o]MVC .S2 B2, CSR ; Mask interrupts
; ===== Branch Occurs =====
; ===== Branch Occurs =====
;-
; =========================== PIPE LOOP PROLOG ============================ ;
; Prolog of the vertical pass. It issues the first loads of the pipelined
; loop and, in the otherwise idle slots, builds the packed cosine constants,
; derives the output pointer from the input pointer, parks the stack pointer
; in IRP (B15 is reused as a data register by the loop) and completes the
; stack frame (A14, loop trip count, CSR).
; NOTE(review): the ;[n,m] tags look like software-pipeline schedule
; annotations (slot within one iteration, iteration number) -- confirm
; against the TI tool conventions.
LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,1]
|| MVC .S2 B15, IRP ; Save Stack Pointer
LDH .D1T2 *-A_i_ptr [16], B_f1 ;[ 2,1]
|| MVK .S1 4, A_i ; Inner loop counter #2
LDH .D1T1 * A_i_ptr++[ 1], A_f3 ;[ 3,1]
|| MVKL .S1 cst_c7, A_c1c7 ; Cosine term C7
LDH .D1T1 *-A_i_ptr [25], A_f0 ;[ 4,1]
|| MVKL .S1 cst_c0, A_k1c0 ; Cosine term C0
;-
LDH .D1T2 *+A_i_ptr [15], B_f5 ;[ 5,1]
|| MVKL .S1 cst_c6, A_c2c6 ; Cosine term C6
|| MVKL .S2 cst_c6, B_c2c6 ; Cosine term C6
|| MV .L2X A_c1c7, B_c1c7 ; Twin constant register
LDH .D1T2 *-A_i_ptr [ 9], B_f2 ;[ 6,1]
|| MVKLH .S1 cst_c2, A_c2c6 ; Cosine term C2
|| SUB .L1 A_i, 2, A_c ; Prolog collapse cnt = 2
|| ADD .L2X A_i_ptr, -2, B_o_ptr ; Output ptr = input ptr - 2 bytes
;-
LDH .D1T2 *+A_i_ptr [23], B_f6 ;[ 7,1]
|| MVKLH .S2 cst_c1, B_c1c7 ; Cosine term C1
LDH .D1T1 *+A_i_ptr [31], A_f7 ;[ 8,1]
|| MVKLH .S1 cst_c1, A_c1c7 ; Cosine term C1
|| MVKLH .S2 cst_c2, B_c2c6 ; Cosine term C2
MVKL .S2 cst_c5, B_c3c5 ; Cosine term C5
|| MVKLH .S1 1, A_k1c0 ; Constant: 0x0001
|| STW .D2T1 A14, *+B15 [15] ; Save A14 (SP[15])
;-
SUB .L1 A_f3, A_f4, A_q1 ;[ 9,1] q1=g2
|| ADD .S1 A_f3, A_f4, A_h0 ;[10,1]
|| MVKLH .S2 cst_c3, B_c3c5 ; Cosine term C3
|| STW .D2T2 B_o, *+B15 [ 3] ; Spill horiz loop count (SP[ 3])
|| STW .D1T1 A0, *-A15 [14] ; Save CSR (SP[ 2])
LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,2]
|| MV .L2X A_k1c0, B_k1c0 ; Twin constant register
|| MVK .S2 16, B_i ; Inner loop counter #1
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; Steady-state kernel of the software-pipelined vertical pass.
;
; Each label v_loop, v_loop_1 .. v_loop_9 starts one execute packet: the
; labelled instruction plus the `||` lines that follow it issue in parallel,
; so one trip through the kernel is ten packets.
;
; * A_c is the prolog-collapse counter: it is decremented in the first
;   packet while non-zero, and every [!A_c] instruction (the stores and the
;   A_i update) stays disabled until it reaches zero.
; * B_o is the trip counter: the branch in v_loop_4 is taken while it is
;   non-zero and it is decremented in v_loop_6.
; * NOTE(review): B_i and A_i are multiplied by 4 on each pass; when one
;   reads as zero, the [!B_i] / [!A_i] instructions set it back to 4 and add
;   112 bytes to the input / output pointer. This looks like the step from
;   the last column of one 8x8 block to the first column of the next --
;   confirm before changing the counters or the 112-byte stride.
v_loop:
SHR .S1 A_F6, 13, A_F6t ;[22,1]
|| MPY .M2 B_S0, B_c3c5, B_c5S0 ;[22,1]
|| MPY .M1X A_Q0, B_c3c5, A_c5Q0 ;[22,1]
|| ADD .D2 B_f1, B_f6, B_g1 ;[12,2]
|| SUB .S2 B_f2, B_f5, B_g3 ;[12,2]
|| SUB .L2 B_f1, B_f6, B_h3 ;[12,2]
|| LDH .D1T2 *-A_i_ptr [16], B_f1 ;[ 2,3]
||[ A_c]ADD .L1 A_c, -1, A_c ;pro. collapse
;-
v_loop_1:
[!A_c]STH .D2T2 B_F4, *+B_o_ptr [ 8] ;[23,1]
|| MPY .M2 B_S1, B_c1c7, B_c7S1 ;[23,1]
|| MPYLH .M1X A_Q0, B_c3c5, A_c3Q0 ;[23,1]
|| ADD .L2 B_h3, B_g3, B_s0a ;[13,2]
|| SUB .S2 B_h3, B_g3, B_q0a ;[13,2]
|| SUB .S1 A_f0, A_f7, A_s1 ;[13,2] s1=h2
|| ADD .L1 A_f0, A_f7, A_g0 ;[13,2]
|| LDH .D1T1 * A_i_ptr++[ 1], A_f3 ;[ 3,3]
;-
v_loop_2:
[!A_c]STH .D2T2 B_F0, *-B_o_ptr [24] ;[24,1]
|| SUB .S2X B_c3S0, A_c5Q0, B_F3 ;[24,1]
|| MPYLH .M2 B_S1, B_c1c7, B_c1S1 ;[24,1]
|| ADD .L2 B_f2, B_f5, B_h1 ;[14,2]
|| SUB .S1 A_g0, A_h0, A_r0 ;[14,2]
|| ADD .L1 A_g0, A_h0, A_p0 ;[14,2]
|| MPYSU .M1X B_q0a, A_k1c0, A_q0b ;[14,2]
|| LDH .D1T1 *-A_i_ptr [25], A_f0 ;[ 4,3]
;-
v_loop_3:
[!A_c]SHR .S1 A_F2, 13, A_F2t ;[25,1]
||[!A_c]MPY .M1 A_i, 4, A_i ;[25,1]
|| SHR .S2 B_F3, 13, B_F3t ;[25,1]
|| SUB .L2X B_c7S1, A_c1Q1, B_F7 ;[25,1]
|| ADD .L1X A_c3Q0, B_c5S0, A_F5 ;[25,1]
|| SUB .D2 B_g1, B_h1, B_r1 ;[15,2]
|| MPYSU .M2 B_s0a, B_k1c0, B_s0b ;[15,2]
|| LDH .D1T2 *+A_i_ptr [15], B_f5 ;[ 5,3]
;-
v_loop_4:
ADD .L1X A_c7Q1, B_c1S1, A_F1 ;[26,1]
||[ B_o]B .S2 v_loop ;[26,1]
||[!A_c]STH .D2T1 A_F6t, *+B_o_ptr [24] ;[26,1]
|| ADD .L2 B_g1, B_h1, B_p1 ;[16,2]
|| ADDK .S1 07FFFh, A_q0b ;[16,2]
|| MPY .M1 A_r0, A_c2c6, A_c6r0 ;[16,2]
|| MPY .M2 B_i, 4, B_i ;[ 6,3]
|| LDH .D1T2 *-A_i_ptr [ 9], B_f2 ;[ 6,3]
;-
v_loop_5:
[!A_c]STH .D2T1 A_F2t, *-B_o_ptr [ 8] ;[27,1]
|| SHR .S1 A_F5, 13, A_F5t ;[27,1]
|| MPY .M2 B_r1, B_c2c6, B_c6r1 ;[17,2]
|| SUB .L2X A_p0, B_p1, B_F4 ;[17,2]
|| ADDK .S2 07FFFh, B_s0b ;[17,2]
|| MPYH .M1 A_q0b, A_k1c0, A_q0 ;[17,2]
|| LDH .D1T2 *+A_i_ptr [23], B_f6 ;[ 7,3]
;-
v_loop_6:
[!A_c]STH .D2T2 B_F3t, * B_o_ptr++[ 1] ;[28,1]
|| SHR .S1 A_F1, 13, A_F1t ;[28,1]
|| ADD .L2X A_p0, B_p1, B_F0 ;[18,2]
|| MPYLH .M1 A_r0, A_c2c6, A_c2r0 ;[18,2]
|| MPYH .M2 B_s0b, B_k1c0, B_s0 ;[18,2]
||[ B_o]SUB .S2 B_o, 1, B_o ;[18,2]
|| LDH .D1T1 *+A_i_ptr [31], A_f7 ;[ 8,3]
v_loop_7:
;-
SHR .S2 B_F7, 13, B_F7t ;[29,1]
||[!A_c]STH .D2T1 A_F5t, *+B_o_ptr [15] ;[29,1]
|| MPYLH .M2 B_r1, B_c2c6, B_c2r1 ;[19,2]
|| SUB .L1 A_q1, A_q0, A_Q0 ;[19,2]
|| ADD .D1 A_q1, A_q0, A_Q1 ;[19,2]
|| SUB .S1 A_f3, A_f4, A_q1 ;[ 9,3] q1=g2
;-
v_loop_8:
[!A_c]STH .D2T1 A_F1t, *-B_o_ptr [17] ;[30,1]
|| ADD .L1X B_c6r1, A_c2r0, A_F2 ;[20,2]
|| SUB .L2X A_s1, B_s0, B_S0 ;[20,2]
|| MPYLH .M1 A_Q1, A_c1c7, A_c1Q1 ;[20,2]
|| ADD .D1 A_f3, A_f4, A_h0 ;[10,3]
||[!B_i]ADD .S2 B_i, 4, B_i ;[10,3]
||[!B_i]ADDK .S1 112, A_i_ptr ;[10,3]
;-
v_loop_9:
[!A_c]STH .D2T2 B_F7t, *+B_o_ptr [31] ;[31,1]
||[!A_i]ADDK .S2 112, B_o_ptr ;[31,1]
||[!A_i]ADD .S1 A_i, 4, A_i ;[31,1]
|| SUB .L1X A_c6r0, B_c2r1, A_F6 ;[21,2]
|| MPYLH .M2 B_S0, B_c3c5, B_c3S0 ;[21,2]
|| ADD .L2X A_s1, B_s0, B_S1 ;[21,2]
|| MPY .M1 A_Q1, A_c1c7, A_c7Q1 ;[21,2]
|| LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,4]
; =========================== PIPE LOOP EPILOG ============================ ;
* ========================================================================= *
* Epilog / Inter-loop / Prolog Code *
* *
* The code from the vertical loop's epilog has been interscheduled *
* with inter-loop code and prolog code for the horizontal loop. *
* This allows hiding some of the overhead as we pipe-down one loop and *
* pipe-up the next. *
* *
* Notably, we restore B15 and IRP here (rather than after the loop) *
* and unspill our loop trip count from the stack, all in parallel with *
* the prolog and epilog code. Also, the epilog of the first loop has *
* been heavily overlapped with the prolog of the second loop. Since *
* a handful of symbolic names have been assigned to different registers, *
* and others have conflicting names between the two loops, we use a set *
* of intermediate symbolic names that bridge the transition. *
* *
* To save a STH/LDH pair, the value of "F7t" from the first loop is *
* forwarded directly to the input "f7" of the second loop. (The last *
* FDCT performed by the vertical loop overlaps the first FDCT performed *
* by the second loop.) This is done through a "sign extension", to *
* exactly mimic the overflow behavior of the original C code. *
* *
* For speed, we twin the stack pointer in a spare slot here so that the *
* stack restore after the loop can proceed as quickly as possible. *
* ========================================================================= *
; Bridging names for the vertical-loop epilog. The Av_ / Bv_ aliases let the
; remaining vertical results still be named after the horizontal-loop .asg
; table further down rebinds the plain A_ / B_ names. Some of them sit in a
; different register than the kernel used (Av_c3Q0 is A6 where the kernel's
; A_c3Q0 is A7; Bv_c1S1 is B6 where B_c1S1 is B5), and the instructions
; below write those values into the new registers accordingly.
.asg A4, Ah_io_ptr ; Horiz Input/output pointer
.asg A14, Av_c1Q1 ; Vert: Intermediate c1 * Q1
.asg A6, Av_c3Q0 ; Vert: Intermediate c3 * Q0
.asg A8, Av_c7Q1 ; Vert: Intermediate c7 * Q1
.asg B6, Bv_c1S1 ; Vert: Intermediate c1 * S1
.asg B3, Bv_c5S0 ; Vert: Intermediate c5 * S0
.asg B5, Bv_c7S1 ; Vert: Intermediate c7 * S1
.asg A8, Av_F1 ; Vert: Freq. domain term F1
.asg A5, Av_F2 ; Vert: Freq. domain term F2
.asg B4, Bv_F3 ; Vert: Freq. domain term F3
.asg A9, Av_F5 ; Vert: Freq. domain term F5
.asg B4, Bv_F7 ; Vert: Freq. domain term F7
.asg A8, Av_F1t ; Vert: Trunc. result for F1
.asg A5, Av_F2t ; Vert: Trunc. result for F2
.asg B7, Bv_F3t ; Vert: Trunc. result for F3
.asg A9, Av_F5t ; Vert: Trunc. result for F5
.asg B5, Bv_F7t ; Vert: Trunc. result for F7
;-
; First part of the vertical-loop drain. The stores are no longer
; predicated. The stack pointer is copied back from IRP into B15, which
; makes the stack frame addressable again; the saved IRP value is then
; loaded from SP[4] into A2 for the restore done in the horizontal prolog.
SHR .S1 A_F6, 13, A_F6t ;[22,4]
|| MPYLH .M2 B_S1, B_c1c7, Bv_c1S1 ;[24,4]
|| MPY .M1X A_Q0, B_c3c5, A_c5Q0 ;[22,4]
|| STH .D2T2 B_F4, *+B_o_ptr [ 8] ;[23,4]
MPY .M2 B_S0, B_c3c5, Bv_c5S0 ;[22,4]
|| MPYLH .M1X A_Q0, B_c3c5, Av_c3Q0 ;[23,4]
|| STH .D2T1 A_F6t, *+B_o_ptr [24] ;[26,4]
|| MVC .S2 IRP, B15 ; Recover stack pointer parked in IRP
;-
STH .D2T2 B_F0, *-B_o_ptr [24] ;[24,4]
|| MPY .M2 B_S1, B_c1c7, B_c7S1 ;[23,4]
|| SUB .S2X B_c3S0, A_c5Q0, B_F3 ;[24,4]
|| ADD .L1X A_c7Q1, Bv_c1S1, A_F1 ;[26,4]
|| ADDK .S1 -54, Ah_io_ptr ; Adjust pointer
SHR .S2 B_F3, 13, B_F3t ;[25,4]
|| LDW .D2T1 *B15[4], A2 ; Load IRP's value
|| SHR .S1 A_F1, 13, Av_F1t ;[28,4]
;-
; ========================================================================= ;
; =============== SYMBOLIC REGISTER ASSIGNMENTS: HORIZ LOOP =============== ;
; Symbolic names for the registers used by the horizontal pass. This table
; rebinds many names from the vertical-loop table to other registers (for
; example A_f0 moves from A9 to A7 and B_f1 from B8 to B13), so from here on
; a plain A_ / B_ name refers to the horizontal-loop register. Names that
; are not rebound here (A_k1c0, B_k1c0, A_c2c6, B_c1c7, B_c3c5, B_o_ptr, ...)
; keep their vertical-loop registers.
; As in the vertical table, several names share one physical register.
.asg A14, A_c3c5 ; Cosine terms c3, c5 (packed)
.asg B1, B_k_rnd ; Rounding constant 0x7FFF
.asg A12, A_k_rnd ; Rounding constant 0x7FFF
.asg B2, B_io_ptr; Input/output pointer
.asg A4, A_io_ptr; Input/output pointer
.asg A7, A_f0 ; Spatial domain sample f0
.asg B13, B_f1 ; Spatial domain sample f1
.asg B3, B_f2 ; Spatial domain sample f2
.asg A10, A_f3 ; Spatial domain sample f3
.asg A0, A_f4 ; Spatial domain sample f4
.asg B7, B_f5 ; Spatial domain sample f5
.asg B9, B_f6t ; Spatial domain sample f6 (tmp)
.asg B10, B_f6 ; Spatial domain sample f6
.asg A8, A_f7 ; Spatial domain sample f7
.asg A0, A_g0 ; Node g0 in flow graph
.asg B3, B_g1 ; Node g1 in flow graph
.asg B8, B_h1 ; Node h1 in flow graph
.asg A9, A_h0 ; Node h0 in flow graph
.asg A1, A_s1 ; Node s1 in flow graph
.asg B13, B_h3 ; Node h3 in flow graph
.asg B10, B_g3 ; Node g3 in flow graph
.asg A10, A_q1 ; Node q1 in flow graph
.asg A5, A_p0 ; Node p0 in flow graph
.asg B4, B_p1 ; Node p1 in flow graph
.asg B4, B_s0a ; Node s0 intermediate value
.asg B5, B_s0b ; Node s0 intermediate value
.asg B5, B_s0c ; Node s0 intermediate value
.asg B9, B_s0 ; Node s0 in flow graph
.asg A0, A_r0 ; Node r0 in flow graph
.asg B7, B_r1 ; Node r1 in flow graph
.asg B5, B_q0a ; Node q0 intermediate value
.asg A3, A_q0b ; Node q0 intermediate value
.asg A0, A_q0c ; Node q0 intermediate value
.asg A6, A_q0 ; Node q0 in flow graph
.asg A9, A_Q1 ; Node Q1 in flow graph
.asg B8, B_S1 ; Node S1 in flow graph
.asg A6, A_Q0 ; Node Q0 in flow graph
.asg B5, B_S0 ; Node S0 in flow graph
.asg A0, A_c1Q1 ; Intermediate value c1 * Q1
.asg A5, A_c2r0 ; Intermediate value c2 * r0
.asg A3, A_c3Q0 ; Intermediate value c3 * Q0
.asg A9, A_c5Q0 ; Intermediate value c5 * Q0
.asg A3, A_c6r0 ; Intermediate value c6 * r0
.asg A3, A_c7Q1 ; Intermediate value c7 * Q1
.asg B7, B_c1S1 ; Intermediate value c1 * S1
.asg B3, B_c2r1 ; Intermediate value c2 * r1
.asg B4, B_c3S0 ; Intermediate value c3 * S0
.asg B10, B_c5S0 ; Intermediate value c5 * S0
.asg B4, B_c6r1 ; Intermediate value c6 * r1
.asg B4, B_c7S1 ; Intermediate value c7 * S1
.asg B5, B_F0 ; Frequency domain term F0
.asg A6, A_F1 ; Frequency domain term F1
.asg B9, B_F2 ; Frequency domain term F2
.asg B3, B_F3 ; Frequency domain term F3
.asg A1, A_F4 ; Frequency domain term F4
.asg A9, A_F5 ; Frequency domain term F5
.asg A3, A_F6 ; Frequency domain term F6
.asg B7, B_F7 ; Frequency domain term F7
.asg B5, B_F0r ; Rounded value for F0
.asg A6, A_F1r ; Rounded value for F1
.asg B3, B_F2r ; Rounded value for F2
.asg B3, B_F3r ; Rounded value for F3
.asg A7, A_F4r ; Rounded value for F4
.asg A9, A_F5r ; Rounded value for F5
.asg A3, A_F6r ; Rounded value for F6
.asg B5, B_F7r ; Rounded value for F7
.asg B6, B_F0t ; Truncated result for F0
.asg A8, A_F1t ; Truncated result for F1
.asg B6, B_F2t ; Truncated result for F2
.asg B4, B_F3t ; Truncated result for F3
.asg A7, A_F4t ; Truncated result for F4
.asg A0, A_F5t ; Truncated result for F5
.asg A5, A_F6t ; Truncated result for F6
.asg B13, B_F7t ; Truncated result for F7
.asg A2, A_o ; Outer loop counter
.asg B0, B_c ; Prolog collapse counter
.asg A1, A_c ; Prolog collapse counter copy
; ========================================================================= ;
* ========================================================================= *
* (Instructions marked "(v)" in the prolog below are from the vertical *
* loop's epilog.) *
* ========================================================================= *
; =========================== PIPE LOOP PROLOG ============================ ;
; Prolog of the horizontal pass, interleaved with the remaining "(v)"
; instructions that drain the vertical loop. Besides the first loads it sets
; up the second (B-side) data pointer, the 0x7FFF rounding constants, the
; prolog-collapse counter B_c, an A-side copy of the c3:c5 constant and the
; twin stack pointer A15; it also restores IRP and reloads the loop trip
; count that was spilled to SP[3].
LDH .D1T2 *-A_io_ptr [ 4], B_f2 ;[ 1,1]
|| SUB A_io_ptr, 12, B_io_ptr ; B-side pointer = A-side pointer - 12 bytes
|| STH .D2T1 Av_F1t, *-B_o_ptr [16] ;[30,4] (v)
|| SHR .S1 Av_F2, 13, Av_F2t ;[25,4] (v)
LDH .D1T1 *-A_io_ptr [ 3], A_f3 ;[ 2,1]
|| LDH .D2T2 *+B_io_ptr [ 5], B_f5 ;[ 2,1]
|| SUB .L2X Bv_c7S1, Av_c1Q1, Bv_F7 ;[25,4] (v)
|| ADD .L1X Av_c3Q0, Bv_c5S0, Av_F5 ;[25,4] (v)
;-
LDH .D2T1 *+B_io_ptr [ 4], A_f4 ;[ 3,1]
|| LDH .D1T2 * A_io_ptr--[ 7], B_f6t ;[ 3,1]
|| MVK .S1 0x7FFF, A_k_rnd ; Rounding value
|| MVK .S2 0x7FFF, B_k_rnd ; Rounding value
LDH .D2T1 * B_io_ptr--[ 8], A_f0 ;[ 5,1]
|| LDH .D1T2 *+A_io_ptr [ 2], B_f1 ;[ 4,1]
|| SHR .S1 Av_F5, 13, Av_F5t ;[27,4] (v)
;-
; The vertical F7 is forwarded straight into f7 of the first horizontal
; iteration instead of being stored and reloaded: shifting left by 3 here
; and right by 16 at the end of this prolog equals the 13-bit truncation
; followed by sign extension from 16 bits.
SHL .S1X Bv_F7, 3, A_f7 ;[29,4] (v)
|| STH .D2T2 Bv_F3t, * B_o_ptr [ 0] ;[28,4] (v)
CLR .S2 B_k1c0, 0,15, B_c ; Prolog collapse: 0x10000
|| MV .L1X B_c3c5, A_c3c5 ; Twin constant register
|| STH .D2T1 Av_F5t, *+B_o_ptr [16] ;[29,4] (v)
MV .L1X B15, A15 ; Twin stack pointer
|| MVC .S2X A2, IRP ; Restore IRP
|| STH .D2T1 Av_F2t, *-B_o_ptr [ 8] ;[27,4] (v)
;-
ADD .L1 A_f3, A_f4, A_h0 ;[ 8,1]
|| ADD .S2 B_f2, B_f5, B_h1 ;[ 8,1]
|| SUB .L2 B_f2, B_f5, B_g3 ;[ 9,1]
|| LDW .D2T1 *B15[3], A_o ; Unspill loop trip count from SP[3]
SUB .L1 A_f3, A_f4, A_q1 ;[ 9,1] q1=g2
|| ADD .L2 B_f1, B_f6t, B_g1 ;[ 9,1]
|| SUB .S2 B_f1, B_f6t, B_h3 ;[ 9,1]
|| SHR .S1 A_f7, 16, A_f7 ; Finish sign extension of forwarded F7
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; Steady-state kernel of the software-pipelined horizontal pass.
;
; Each label h_loop, h_loop_1 .. h_loop_9 starts one execute packet (the
; labelled instruction plus its `||` lines issue in parallel), so one trip
; through the kernel is ten packets.
;
; * A_o is the trip counter: the branch in h_loop_4 is taken while it is
;   non-zero and it is decremented in h_loop_9.
; * The stores are predicated on the prolog-collapse counters B_c and A_c
;   being zero. B_c enters the loop as 0x10000 and both counters are
;   recomputed from it in h_loop_6.
;   NOTE(review): MPYUS reads the low half of B_c and MPYHL the high half,
;   so B_c appears to clear on the first pass and A_c one pass later, which
;   would delay the A_c-predicated stores (F1t, F7t) by one iteration --
;   confirm against the schedule before touching h_loop_6.
; * Results are rounded by adding B_k_rnd / A_k_rnd (0x7FFF) and keeping the
;   upper 16 bits (SHR by 16, or MPYH against the k1c0 constant whose high
;   half is 1). F0 and F4 take a different path: a small constant is added
;   and the sum is shifted right by 3.
h_loop:
[!B_c]STH .D1T2 B_F0t, *+A_io_ptr[ 9] ;[20,1]
|| MPY .M1 A_Q0, A_c3c5, A_c5Q0 ;[20,1]
|| MPYLH .M2 B_S0, B_c3c5, B_c3S0 ;[20,1]
|| ADD .S1X A_c7Q1, B_c1S1, A_F1 ;[20,1]
|| SUB .S2X B_c7S1, A_c1Q1, B_F7 ;[20,1]
|| ADD .L1 A_f0, A_f7, A_g0 ;[10,2]
|| ADD .D2 B_h3, B_g3, B_s0a ;[10,2]
|| SUB .L2 B_h3, B_g3, B_q0a ;[10,2]
;-
h_loop_1:
ADD .L2 B_F7, B_k_rnd, B_F7r ;[21,1]
||[!B_c]STH .D2T1 A_F6t, *+B_io_ptr[22] ;[21,1]
|| SUB .S2 B_g1, B_h1, B_r1 ;[11,2]
|| MPYSU .M2 B_s0a, B_k1c0, B_s0b ;[11,2]
|| MPYSU .M1X B_q0a, A_k1c0, A_q0b ;[11,2]
|| ADD .S1 A_g0, A_h0, A_p0 ;[11,2]
|| SUB .L1 A_g0, A_h0, A_r0 ;[11,2]
|| LDH .D1T2 *-A_io_ptr [ 2], B_f5 ;[ 1,3]
;-
h_loop_2:
SUB .S2X B_c3S0, A_c5Q0, B_F3 ;[22,1]
|| ADD .S1X A_c3Q0, B_c5S0, A_F5 ;[22,1]
|| ADD .L1 A_F1, A_k_rnd, A_F1r ;[22,1]
|| MPYH .M2 B_F7r, B_k1c0, B_F7t ;[22,1]
|| ADD .L2 B_g1, B_h1, B_p1 ;[12,2]
|| MPY .M1 A_r0, A_c2c6, A_c6r0 ;[12,2]
|| LDH .D1T1 *-A_io_ptr [ 4], A_f3 ;[ 2,3]
|| LDH .D2T2 *+B_io_ptr [ 2], B_f2 ;[ 2,3]
;-
h_loop_3:
ADD .S2 B_F3, B_k_rnd, B_F3r ;[23,1]
|| SUB .S1X A_p0, B_p1, A_F4 ;[13,2]
|| ADD .L1 A_q0b, A_k_rnd, A_q0c ;[13,2]
|| ADD .L2 B_s0b, B_k_rnd, B_s0c ;[13,2]
|| MPYLH .M2X B_r1, A_c2c6, B_c2r1 ;[13,2]
|| MPYLH .M1 A_r0, A_c2c6, A_c2r0 ;[13,2]
|| LDH .D2T1 *+B_io_ptr [ 4], A_f4 ;[ 3,3]
|| LDH .D1T2 *-A_io_ptr [ 1], B_f6 ;[ 3,3]
;-
h_loop_4:
ADD .L1 A_F5, A_k_rnd, A_F5r ;[24,1]
|| SHR .S2 B_F3r, 16, B_F3t ;[24,1]
||[ A_o]B .S1 h_loop ;[24,1]
|| ADD .L2X A_p0, B_p1, B_F0 ;[14,2]
|| MPYH .M2 B_s0c, B_k1c0, B_s0 ;[14,2]
|| MPYH .M1 A_q0c, A_k1c0, A_q0 ;[14,2]
|| LDH .D1T1 * A_io_ptr--[ 8], A_f7 ;[ 4,3]
|| LDH .D2T2 *+B_io_ptr [ 1], B_f1 ;[ 4,3]
;-
h_loop_5:
ADD .S2 B_F2, B_k_rnd, B_F2r ;[25,1]
|| MPYH .M1 A_F1r, A_k1c0, A_F1t ;[25,1]
|| ADD .D1 A_F4, 4, A_F4r ;[15,2]
|| SUB .S1X A_c6r0, B_c2r1, A_F6 ;[15,2]
|| SUB .L1 A_f0, A_f7, A_s1 ;[15,2] s1=h2
|| ADD .L2 B_F0, 6, B_F0r ;[15,2]
|| MPY .M2X B_r1, A_c2c6, B_c6r1 ;[15,2]
|| LDH .D2T1 * B_io_ptr--[ 8], A_f0 ;[ 5,3]
;-
h_loop_6:
SHR .S1 A_F5r, 16, A_F5t ;[26,1]
|| SHR .S2 B_F2r, 16, B_F2t ;[26,1]
||[!B_c]STH .D2T2 B_F3t, *+B_io_ptr[27] ;[26,1]
|| SUB .L1 A_q1, A_q0, A_Q0 ;[16,2]
|| ADD .L2X A_s1, B_s0, B_S1 ;[16,2]
|| ADD .D1 A_q1, A_q0, A_Q1 ;[16,2]
|| MPYUS .M2 B_c, 2, B_c ;pro. collapse
|| MPYHL .M1X B_c, A_c2c6, A_c ;pro. collapse
;-
h_loop_7:
[!B_c]STH .D1T1 A_F5t, *+A_io_ptr[22] ;[27,1]
||[!B_c]STH .D2T2 B_F2t, *+B_io_ptr[26] ;[27,1]
|| ADD .L1 A_F6, A_k_rnd, A_F6r ;[17,2]
|| SHR .S2 B_F0r, 3, B_F0t ;[17,2]
|| SHR .S1 A_F4r, 3, A_F4t ;[17,2]
|| SUB .L2X A_s1, B_s0, B_S0 ;[17,2]
|| MPY .M2 B_S1, B_c1c7, B_c7S1 ;[17,2]
|| MPY .M1X A_Q1, B_c1c7, A_c7Q1 ;[17,2]
;-
h_loop_8:
[!A_c]STH .D1T1 A_F1t, *+A_io_ptr[18] ;[28,1]
||[!A_c]STH .D2T2 B_F7t, *+B_io_ptr[31] ;[28,1]
|| SHR .S1 A_F6r, 16, A_F6t ;[18,2]
|| ADD .L2X B_c6r1, A_c2r0, B_F2 ;[18,2]
|| MPYLH .M2 B_S1, B_c1c7, B_c1S1 ;[18,2]
|| MPYLH .M1X A_Q1, B_c1c7, A_c1Q1 ;[18,2]
|| ADD .L1 A_f3, A_f4, A_h0 ;[ 8,3]
|| ADD .S2 B_f2, B_f5, B_h1 ;[ 8,3]
;-
h_loop_9:
[ A_o]SUB .S1 A_o, 1, A_o ;[19,2]
||[!B_c]STH .D1T1 A_F4t, *+A_io_ptr[13] ;[19,2]
|| MPYLH .M1 A_Q0, A_c3c5, A_c3Q0 ;[19,2]
|| MPY .M2 B_S0, B_c3c5, B_c5S0 ;[19,2]
|| SUB .L1 A_f3, A_f4, A_q1 ;[ 9,3] q1=g2
|| ADD .L2 B_f1, B_f6, B_g1 ;[ 9,3]
|| SUB .D2 B_f1, B_f6, B_h3 ;[ 9,3]
|| SUB .S2 B_f2, B_f5, B_g3 ;[ 9,3]
; =========================== PIPE LOOP EPILOG ============================ ;
; EPILOG:
;-
* ========================================================================= *
* Epilog / Final Cleanup Code. *
* *
* This code performs the final stores from the epilog while restoring *
* Save-On-Entry values from the stack. The two processes are heavily *
* interwoven in the interest of speed. For instance, the return addr. *
* is loaded immediately and branched to as soon as it lands in the *
* register file. Meanwhile, the final epilog stores complete as the *
* return-branch is taken. *
* *
* Note that a handful of symbolic names have been reassigned in the *
* epilog to avoid interfering with the values being loaded from the *
* stack. *
* ========================================================================= *
; Final drain of the horizontal loop, interleaved with the restore of the
; saved registers and the return to the caller.
;
; The five names below are rebound because their horizontal-loop registers
; (B13, B3 and A0) are targets of the stack reloads issued in this epilog.
.asg B5, B_F7t
.asg B9, B_F2r
.asg B8, B_F3
.asg B8, B_F3r
.asg A9, A_F5t
; Restores read through both B15 and its twin A15 (equal at this point) so
; that two loads can issue per packet. The return address is loaded first so
; that the RET can be issued as early as possible; the last stores and the
; CSR restore follow the RET in program order.
MPY .M1 A_Q0, A_c3c5, A_c5Q0
|| MPYLH .M2 B_S0, B_c3c5, B_c3S0
|| ADD .S1X A_c7Q1, B_c1S1, A_F1
|| SUB .S2X B_c7S1, A_c1Q1, B_F7
|| ADD .L2 B_F2, B_k_rnd, B_F2r
|| LDW .D2T1 *+ B15[ 2], A0 ; Load CSR's value
|| LDW .D1T2 *+ A15[ 5], B3 ; Load return address
;-
ADD .L2 B_F7, B_k_rnd, B_F7r
|| ADD .L1 A_F1, A_k_rnd, A_F1r
|| LDW .D2T2 *+ B15[ 8], B11 ; Restore B11
|| LDW .D1T1 *+ A15[13], A13 ; Restore A13
MPYH .M2 B_F7r, B_k1c0, B_F7t
|| MPYH .M1 A_F1r, A_k1c0, A_F1t
|| LDW .D1T2 *+ A15[ 6], B10 ; Restore B10
|| LDW .D2T1 *+ B15[ 7], A10 ; Restore A10
;-
ADD .S1X A_c3Q0, B_c5S0, A_F5
|| SUB .S2X B_c3S0, A_c5Q0, B_F3
|| LDW .D1T2 *+ A15[14], B14 ; Restore B14
|| LDW .D2T1 *+ B15[15], A14 ; Restore A14
ADD .L2 B_F3, B_k_rnd, B_F3r
|| ADD .L1 A_F5, A_k_rnd, A_F5r
|| LDW .D1T2 *+ A15[10], B12 ; Restore B12
|| LDW .D2T1 *+ B15[11], A12 ; Restore A12
;-
RET .S2 B3 ; Return to caller
|| LDW .D2T1 *+ B15[12], A11 ; Restore A11
SHR .S2 B_F3r, 16, B_F3t
|| LDW .D2T1 *++B15[16], A15 ; Rst. A15, release stack
|| LDW .D1T2 *+ A15[ 9], B13 ; Restore B13
STH .D1T1 A_F1t, *+A_io_ptr[10]
|| STH .D2T2 B_F7t, *+B_io_ptr[23]
|| SHR .S1 A_F5r, 16, A_F5t
;-
STH .D2T1 A_F6t, *+B_io_ptr[22]
|| STH .D1T2 B_F0t, *+A_io_ptr[ 9]
SHR .S2 B_F2r, 16, B_F2t
|| STH .D2T2 B_F3t, *+B_io_ptr[19]
STH .D1T1 A_F5t, *+A_io_ptr[14]
|| STH .D2T2 B_F2t, *+B_io_ptr[18]
|| MVC .S2X A0, CSR ; Restore CSR
;-
; ===== Interruptibility state restored here =====
; ===== Branch Occurs =====
* ========================================================================= *
* End of file: img_fdct_8x8.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2003 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *