; Origin: www.pudn.com > dm642video-driver.rar > ycbcr422pl_to_rgb565_h.asm


; 
;  Copyright 2003 by Texas Instruments Incorporated. 
;  All rights reserved. Property of Texas Instruments Incorporated. 
;  Restricted rights to use, duplicate or disclose this code are 
;  granted through contract. 
;   
; 
; "@(#) DDK 1.10.00.21 06-26-03 (ddk-b10)" 
* ========================================================================= * 
*                                                                           * 
*   USAGE                                                                   * 
*       This function is C callable, and is called according to this        * 
*       C prototype:                                                        * 
*                                                                           * 
*       void ycbcr422pl_to_rgb565                                           * 
*       (                                                                   * 
*           const short         coeff[5],  -- Matrix coefficients.          * 
*           const unsigned char *y_data,   -- Luminance data  (Y')          * 
*           const unsigned char *cb_data,  -- Blue color-diff (B'-Y')       * 
*           const unsigned char *cr_data,  -- Red color-diff  (R'-Y')       * 
*           unsigned short      *rgb_data, -- RGB 5:6:5 packed pixel out.   * 
*           unsigned            num_pixels -- # of luma pixels to process.  * 
*       )                                                                   * 
*                                                                           * 
*       The 'coeff[]' array contains the color-space-conversion matrix      * 
*       coefficients.  The 'y_data', 'cb_data' and 'cr_data' pointers       * 
*       point to the separate input image planes.  The 'rgb_data' pointer   * 
*       points to the output image buffer, and must be word aligned.        * 
*                                                                           * 
*       The kernel is designed to process arbitrary amounts of 4:2:2        * 
*       image data, although 4:2:0 image data may be processed as well.     * 
*       For 4:2:2 input data, the 'y_data', 'cb_data' and 'cr_data'         * 
*       arrays may hold an arbitrary amount of image data.  For 4:2:0       * 
*       input data, only a single scan-line (or portion thereof) may be     * 
*       processed at a time.                                                * 
*                                                                           * 
*       The coefficients in the coeff array must be in signed Q13 form.     * 
*       These coefficients correspond to the following matrix equation:     * 
*                                                                           * 
*           [ Y' -  16 ]   [ coeff[0] 0.0000   coeff[1] ]     [ R']         * 
*           [ Cb - 128 ] * [ coeff[0] coeff[2] coeff[3] ]  =  [ G']         * 
*           [ Cr - 128 ]   [ coeff[0] coeff[4] 0.0000   ]     [ B']         * 
*                                                                           * 
*   DESCRIPTION                                                             * 
*       This function runs for 46 + (num_pixels * 3) cycles, including      * 
*       6 cycles of function-call overhead.  Interrupts are masked for      * 
*       37 + (num_pixels * 3) cycles.  Code size is 512 bytes.              * 
*                                                                           * 
*       This kernel performs Y'CbCr to RGB conversion.  From the Color      * 
*       FAQ, http://home.inforamp.net/~poynton/ColorFAQ.html :              * 
*                                                                           * 
*           Various scale factors are applied to (B'-Y') and (R'-Y')        * 
*           for different applications.  The Y'PbPr scale factors are       * 
*           optimized for component analog video.  The Y'CbCr scaling       * 
*           is appropriate for component digital video, JPEG and MPEG.      * 
*           Kodak's PhotoYCC(tm) uses scale factors optimized for the       * 
*           gamut of film colors.  Y'UV scaling is appropriate as an        * 
*           intermediate step in the formation of composite NTSC or PAL     * 
*           video signals, but is not appropriate when the components       * 
*           are kept separate.  Y'UV nomenclature is now used rather        * 
*           loosely, and it sometimes denotes any scaling of (B'-Y')        * 
*           and (R'-Y').  Y'IQ coding is obsolete.                          * 
*                                                                           * 
*       This code can perform various flavors of Y'CbCr to RGB conversion   * 
*       as long as the offsets on Y, Cb, and Cr are -16, -128, and -128,    * 
*       respectively, and the coefficients match the pattern shown.         * 
*                                                                           * 
*       The kernel implements the following matrix form, which involves 5   * 
*       unique coefficients:                                                * 
*                                                                           * 
*           [ Y' -  16 ]   [ coeff[0] 0.0000   coeff[1] ]     [ R']         * 
*           [ Cb - 128 ] * [ coeff[0] coeff[2] coeff[3] ]  =  [ G']         * 
*           [ Cr - 128 ]   [ coeff[0] coeff[4] 0.0000   ]     [ B']         * 
*                                                                           * 
*                                                                           * 
*       Below are some common coefficient sets, along with the matrix       * 
*       equation that they correspond to.   Coefficients are in signed      * 
*       Q13 notation, which gives a suitable balance between precision      * 
*       and range.                                                          * 
*                                                                           * 
*       1.  Y'CbCr -> RGB conversion with RGB levels that correspond to     * 
*           the 219-level range of Y'.  Expected ranges are [16..235] for   * 
*           Y' and [16..240] for Cb and Cr.                                 * 
*                                                                           * 
*           coeff[] = { 0x2000, 0x2BDD, -0x0AC5, -0x1658, 0x3770 };         * 
*                                                                           * 
*           [ Y' -  16 ]   [ 1.0000    0.0000    1.3707 ]     [ R']         * 
*           [ Cb - 128 ] * [ 1.0000   -0.3365   -0.6982 ]  =  [ G']         * 
*           [ Cr - 128 ]   [ 1.0000    1.7324    0.0000 ]     [ B']         * 
*                                                                           * 
*       2.  Y'CbCr -> RGB conversion with the 219-level range of Y'         * 
*           expanded to fill the full RGB dynamic range.  (The matrix has   * 
*           been scaled by 255/219.)  Expected ranges are [16..235] for Y'  * 
*           and [16..240] for Cb and Cr.                                    * 
*                                                                           * 
*           coeff[] = { 0x2543, 0x3313, -0x0C8A, -0x1A04, 0x408D };         * 
*                                                                           * 
*           [ Y' -  16 ]   [ 1.1644    0.0000    1.5960 ]     [ R']         * 
*           [ Cb - 128 ] * [ 1.1644   -0.3918   -0.8130 ]  =  [ G']         * 
*           [ Cr - 128 ]   [ 1.1644    2.0172    0.0000 ]     [ B']         * 
*                                                                           * 
*       Other scalings of the color differences (B'-Y') and (R'-Y')         * 
*       (sometimes incorrectly referred to as U and V) are supported, as    * 
*       long as the color differences are unsigned values centered around   * 
*       128 rather than signed values centered around 0, as noted above.    * 
*                                                                           * 
*       In addition to performing plain color-space conversion, color       * 
*       saturation can be adjusted by scaling coeff[1] through coeff[4].    * 
*       Similarly, brightness can be adjusted by scaling coeff[0].          * 
*       General hue adjustment can not be performed, however, due to the    * 
*       two zeros hard-coded in the matrix.                                 * 
*                                                                           * 
*   TECHNIQUES                                                              * 
*       Pixel replication is performed implicitly on chroma data to         * 
*       reduce the total number of multiplies required.  The chroma         * 
*       portion of the matrix is calculated once for each Cb, Cr pair,      * 
*       and the result is added to both Y' samples.                         * 
*                                                                           * 
*       Luma is biased downwards to produce R, G, and B values that are     * 
*       signed quantities centered around zero, rather than unsigned qtys.  * 
*       This allows us to use SSHL to perform saturation, followed by a     * 
*       quick XOR to correct the sign bits in the final packed pixels.      * 
*       The required downward bias is 128 shifted left by the Q-point, 13.  * 
*                                                                           * 
*       To save two instructions, I transformed "(y0-16)*luma - (128<<13)"  * 
*       to the slightly more cryptic "y0*luma - (16*luma + (128<<13))".     * 
*       This gives me the non-obvious but effective y_bias value            * 
*       -((128 << 13) + 16*luma).  The transformation allows me to fit in   * 
*       a 6 cycle loop.                                                     * 
*                                                                           * 
*       Twin pointers are used for the stack and coeff[] arrays for speed.  * 
*                                                                           * 
*       Because the loop accesses four different arrays at three different  * 
*       strides, no memory accesses are allowed to parallelize in the       * 
*       loop.  No bank conflicts occur, as a result.                        * 
*                                                                           * 
*       Creatively constructed multiplies are used to avoid a bottleneck    * 
*       on shifts in the loop.  In particular, the 5-bit mask 0xF8000000    * 
*       doubles as a right-shift constant that happens to negate while      * 
*       shifting.  This negation is reversed by merging the bits with a     * 
*       SUB instead of an ADD or OR.                                        * 
*                                                                           * 
*       Prolog and epilog collapsing have been performed, with only a       * 
*       partial stage of prolog and epilog left uncollapsed.  The partial   * 
*       stages are interscheduled with the rest of the code for speed.      * 
*                                                                           * 
*       The stack pointer is saved in IRP to allow all 32 registers to      * 
*       be used in the loop.  This enabled prolog collapsing by freeing     * 
*       up a predicate register.  The prolog collapse counter is            * 
*       implemented as a MPY which shifts a constant left by 3 bits each    * 
*       iteration.  The counter is initialized from one of the other        * 
*       constant registers, thereby reducing the S-unit bottleneck in the   * 
*       setup code.                                                         * 
*                                                                           * 
*       Instructions have been scheduled to minimize fetch-packet padding   * 
*       NOPs.  Only 3 padding NOPs and 1 explicit NOP remain.               * 
*                                                                           * 
*   ASSUMPTIONS                                                             * 
*       An even number of luma samples needs to be processed.               * 
*                                                                           * 
*       The output image must be word aligned.                              * 
*                                                                           * 
*   NOTES                                                                   * 
*       No bank conflicts occur.                                            * 
*                                                                           * 
*       Codesize is 512 bytes.                                              * 
*                                                                           * 
*       On average, one bank per cycle is accessed on a C6201 in the loop,  * 
*       with 1 cycle of 6 accessing no banks, and 1 cycle accessing two.    * 
*                                                                           * 
*       The kernel requires 14 words of stack space.                        * 
*                                                                           * 
*   SOURCE                                                                  * 
*       Poynton, Charles et al.  "The Color FAQ,"  1999.                    * 
*           http://home.inforamp.net/~poynton/ColorFAQ.html                 * 
*                                                                           * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 1999 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= * 
 
                ; Copyright notice, emitted as one NUL-terminated string
                ; (two .string directives, only the second ends in 0) into
                ; its own data subsection.  No instruction in the lines
                ; visible here references _Copyright.
                .sect ".data:copyright_h" 
_Copyright:     .string "Copyright (C) 1999 Texas Instruments Incorporated. " 
                .string "All Rights Reserved.",0 
                .sect ".text:hand" 
                .global _yc2rgb16 
; ------------------------------------------------------------------------- ;
;  _yc2rgb16 -- planar Y'CbCr 4:2:2 to packed RGB 5:6:5 conversion.
;
;  NOTE(review): the header comment documents the C prototype under the
;  name ycbcr422pl_to_rgb565, but the symbol exported here is _yc2rgb16.
;  Confirm which name callers actually link against.
;
;  Incoming arguments, as named by the .asg directives below:
;      A4 = coeff       B4 = y_data      A6 = cb_data
;      B6 = cr_data     A8 = rgb_data    B8 = num_pixels
;
;  The .asg names are textual aliases only.  Many of them map to the SAME
;  physical register (for example A3 is A_SP, A_cr_, A_y0t_, A_r0 and
;  A_g0f; B15 is both B_SP and B_cb_ptr; B0 is both B_csr and B_p).  The
;  code is only correct because the hand schedule keeps the live ranges of
;  such aliases from overlapping -- do not reorder instructions or reassign
;  registers without re-deriving the whole schedule.
; ------------------------------------------------------------------------- ;
_yc2rgb16: 
; =============== SYMBOLIC REGISTER ASSIGNMENTS: ARGUMENTS ================ ; 
        .asg            A4,         A_coef      ; Coefficients table (coeff[])
        .asg            B4,         B_y_data    ; Pointer to luma plane
        .asg            A6,         A_cb_data   ; Pointer to Cb (B'-Y') plane
        .asg            B6,         B_cr_data   ; Pointer to Cr (R'-Y') plane
        .asg            A8,         A_rgb_data  ; Pointer to RGB output
        .asg            B8,         B_num_pix   ; # of luma pixels to process
; ================= SYMBOLIC REGISTER ASSIGNMENTS: SETUP ================== ; 
        .asg            B15,        B_SP        ; Stack pointer, B datapath 
        .asg            A3,         A_SP        ; Stack pointer copy, A datapath
        .asg            B0,         B_csr       ; CSR's value on entry
        .asg            B1,         B_noGIE     ; CSR w/ GIE bit cleared 
        .asg            B2,         B_irp       ; IRP's value on entry
        .asg            A0,         A_csr       ; Copy of CSR's value 
        .asg            B3,         B_ret       ; Return address 
        .asg            B7,         B_coef      ; Twin coefficients ptr. 
        .asg            A13,        A_rcr       ; Cr's contribution to Red 
        .asg            B14,        B_bcb       ; Cb's contribution to Blu 
        .asg            A5,         A_gcr_      ; Cr's contribution to Grn 
        .asg            A5,         A_gcr       ; A_gcr_ << 16 
        .asg            B5,         B_gcb_      ; Cb's contribution to Grn 
        .asg            B5,         B_gcb       ; B_gcb_ << 16 
        .asg            A1,         A_lneg      ; luma coeff[0] < 0 
; ================= SYMBOLIC REGISTER ASSIGNMENTS: KERNEL ================= ; 
        .asg            B0,         B_p         ; Prolog collapse counter 
        .asg            A2,         A_i         ; Loop trip counter 
        .asg            A10,        A_y_ptr     ; Luma data pointer 
        .asg            B15,        B_cb_ptr    ; B-Y data pointer (reuses SP!)
        .asg            B6,         B_cr_ptr    ; R-Y data pointer 
        .asg            B11,        B_rgb_ptr   ; RGB output data pointer 
        .asg            B12,        B_k32_k128  ; Constant 0x00200080 
        .asg            A11,        A_k32_k128  ; Constant 0x00200080 
        .asg            A12,        A_one_lum   ; Constant 1 packed w/coeff[0] 
        .asg            A13,        A_gcr_rcr   ; coeff[3], coeff[1] packed 
        .asg            B14,        B_gcb_bcb   ; coeff[2], coeff[4] packed 
        .asg            B10,        B_y_bias    ; (128<<13) + 16*coeff[0]; SUBtracted
        .asg            B13,        B_ms5       ; Mask:  upper 5 bits 
        .asg            A14,        A_ms6       ; Mask:  upper 6 bits 
        .asg            A15,        A_sflip     ; Sign-flip const 0x84108410 
        .asg            A0,         A_y0        ; y0 value from y_data[] 
        .asg            B4,         B_y1        ; y1 value from y_data[] 
        .asg            B1,         B_cb_       ; cb value prior to level shift 
        .asg            A3,         A_cr_       ; cr value prior to level shift 
        .asg            B3,         B_cb        ; level-shifted cb value. 
        .asg            A4,         A_cr        ; level-shifted cr value 
        .asg            B5,         B_y1t_      ; scaled y1, before level shift 
        .asg            A3,         A_y0t_      ; scaled y0, before level shift 
        .asg            B9,         B_y1t       ; scaled, level-shifted y1 
        .asg            A5,         A_y0t       ; scaled, level-shifted y0 
        .asg            B3,         B_bt        ; Scaled blue color-diff 
        .asg            B1,         B_gt_       ; Scaled green color-diff (a) 
        .asg            A8,         A_gt_       ; Scaled green color-diff (b) 
        .asg            A6,         A_gt        ; Scaled green color-diff 
        .asg            A9,         A_rt        ; Scaled red color-diff 
        .asg            B1,         B_r1        ; Pixel 1 red  (16Q16) 
        .asg            B3,         B_g1        ; Pixel 1 grn  (17Q15) 
        .asg            B4,         B_b1        ; Pixel 1 blu  (16Q16) 
        .asg            A3,         A_r0        ; Pixel 0 red  (16Q16) 
        .asg            A5,         A_g0        ; Pixel 0 grn  (17Q15) 
        .asg            A0,         A_b0        ; Pixel 0 blu  (16Q16) 
        .asg            B5,         B_r1s       ; Saturated pixel 1 red (5Q27) 
        .asg            B4,         B_g1s       ; Saturated pixel 1 grn (6Q26) 
        .asg            B5,         B_b1s       ; Saturated pixel 1 blu (5Q27) 
        .asg            A1,         A_r0s       ; Saturated pixel 0 red (5Q27) 
        .asg            A4,         A_g0s       ; Saturated pixel 0 grn (6Q26) 
        .asg            A4,         A_b0s       ; Saturated pixel 0 blu (5Q27) 
        .asg            B8,         B_r1t       ; Truncated pixel 1 red 
        .asg            B7,         B_g1t       ; Truncated pixel 1 grn 
        .asg            B2,         B_b1t       ; Truncated pixel 1 blu 
        .asg            A7,         A_r0t       ; Truncated pixel 0 red 
        .asg            A4,         A_g0t       ; Truncated pixel 0 grn 
        .asg            A5,         A_b0t       ; Truncated pixel 0 blu 
        .asg            B2,         B_g1f       ; Pixel 1 grn in final position 
        .asg            B1,         B_b1f       ; Pixel 1 blu in final position 
        .asg            B8,         B_r_b1      ; Pixel 1 red, blue merged 
        .asg            B4,         B_rgb1      ; Pixel 1 red, grn, blu merged 
        .asg            A3,         A_g0f       ; Pixel 0 grn in final position 
        .asg            A6,         A_b0f       ; Pixel 0 blu in final position 
        .asg            A7,         A_r_b0      ; Pixel 0 red, blue merged 
        .asg            A9,         A_rgb0_     ; Pixel 0 red, grn, blu merged 
        .asg            A6,         A_rgb0      ; Pixel 0 in low half word 
        .asg            B5,         B_rgb_      ; Combined pixels pre-sign-fix 
        .asg            B7,         B_rgb       ; Combined pixels w/ sign-fix 
; ========================================================================= ; 
        ; Stack frame.  14 words:  A10..A15, B10..B14, B3, CSR, IRP 
        ;
        ; Setup: allocate the frame, save the registers this routine
        ; overwrites, mask interrupts, park SP in IRP, and start loading the
        ; coefficients and constants.  Word offsets from the NEW stack
        ; pointer (after the post-decrement by 14 below):
        ;
        ;     [14] A15      [13] IRP (stored later, in the loop prolog)
        ;     [12] A14      [11] B14
        ;     [10] A13      [ 9] B13
        ;     [ 8] A12      [ 7] B12
        ;     [ 6] A11      [ 5] B11
        ;     [ 4] A10      [ 3] B10
        ;     [ 2] CSR      [ 1] B3 (return address)
        ;
        ; coeff[0] is loaded signed (LDH) because its sign is examined
        ; later; coeff[1..4] are loaded zero-extended (LDHU) so that two of
        ; them can be packed into one register (high half / low half)
        ; without the low half's sign bits spilling into the high half.
;- 
        STW     .D2T1   A15,        *B_SP--[14]         ; Save A15, get stack 
||      MVC     .S2     CSR,        B_csr               ; Capture CSR's state 
||      MV      .L2X    A_coef,     B_coef              ; Twin coef pointer 
||      MVK     .S1     0xFFFF8410, A_sflip             ; Sign-flip cst, low 
 
        MV      .S1X    B_SP,       A_SP                ; Twin Stack Pointer 
||      AND     .L2     B_csr,      -2,         B_noGIE ; Clear GIE (CSR bit 0)
||      LDHU    .D1T2   *A_coef[2], B_gcb_              ; gcb = coeff[2] 
||      LDHU    .D2T1   *B_coef[3], A_gcr_              ; gcr = coeff[3] 
;- 
        STW     .D1T1   A14,        *+A_SP[12]          ; Save A14 
||      STW     .D2T2   B14,        *+B_SP[11]          ; Save B14 
||      MVC     .S2     B_noGIE,    CSR                 ; Disable interrupts 
||      ZERO    .L1     A_ms6                           ; Mask 6, low 
; ===== Interrupts masked here ===== 
        STW     .D1T1   A13,        *+A_SP[10]          ; Save A13 
||      STW     .D2T2   B13,        *+B_SP[ 9]          ; Save B13 
||      MVC     .S2     IRP,        B_irp               ; Capture IRP's state 
||      ZERO    .L2     B_ms5                           ; Mask 5, low 
;- 
        STW     .D1T1   A12,        *+A_SP[ 8]          ; Save A12 
||      STW     .D2T2   B12,        *+B_SP[ 7]          ; Save B12 
||      MVC     .S2     B_SP,       IRP                 ; Save SP in IRP (B15 is reused as B_cb_ptr)
||      MVKLH   .S1     0xFC00,     A_ms6               ; Mask 6, high 
 
        LDH     .D1T1   *A_coef[0], A_one_lum           ; lum = coeff[0] (signed)
||      MV      .L1X    B_csr,      A_csr               ; Partitioning MV 
 
        STW     .D1T1   A11,        *+A_SP[ 6]          ; Save A11 
||      STW     .D2T2   B11,        *+B_SP[ 5]          ; Save B11 
;- 
        LDHU    .D2T1   *B_coef[1], A_rcr               ; rcr = coeff[1] 
||      LDHU    .D1T2   *A_coef[4], B_bcb               ; bcb = coeff[4] 
 
        STW     .D1T1   A10,        *+A_SP[ 4]          ; Save A10 
||      STW     .D2T2   B10,        *+B_SP[ 3]          ; Save B10 
||      MV      .L1X    B_y_data,   A_y_ptr             ; Partitioning MV 
 
        STW     .D1T1   A_csr,      *+A_SP[ 2]          ; Save CSR (original, GIE intact)
||      STW     .D2T2   B_ret,      *+B_SP[ 1]          ; Save return address 
||      MVK     .S2     128,        B_k32_k128          ; Constant: 128 
;- 
; =========================== PIPE LOOP PROLOG ============================ ; 
;  Only the first four loads of the first pixel pair are issued here; the
;  rest of the prolog is collapsed into the kernel and gated by B_p.  The
;  same packets finish building the loop constants:
;
;    A_i        = (num_pixels & ~1) + 2      loop counter, decremented by 2
;    B_p        = 128                        prolog-collapse counter seed
;    A_one_lum  = 0x0001 : coeff[0]          (high half : low half)
;    B_k32_k128 = A_k32_k128 = 0x00200080    32 : 128
;    B_y_bias   = (128 << 13) + 16*coeff[0]  subtracted from y*coeff[0]
;    A_gcr_rcr  = coeff[3] : coeff[1]
;    B_gcb_bcb  = coeff[2] : coeff[4]
;    B_ms5      = 0xF8000000,  A_sflip = 0x84108410
;
;  B_cb_ptr is B15, so the first packet below destroys the B-side stack
;  pointer.  From here until the epilog the stack pointer lives only in IRP
;  (written in the setup code) and, temporarily, in A_SP (A3).  A3 is also
;  the destination of the cr load issued in the first packet, and the STW
;  of IRP at the end of this prolog is the last use of A3 as a stack
;  pointer before the kernel.
;
;  NOTE(review): MPYH and MVKLH in the first packet both name A_one_lum.
;  The lneg computation relies on MPYH reading the sign-extended LDH result
;  (high half 0xFFFF when coeff[0] < 0, giving (-1)*(-1) = 1) rather than
;  the value MVKLH writes in the same packet -- confirm against the C6000
;  execute-packet read/write rules before touching this packet.
        LDBU    .D2T1   *B_cr_ptr++,            A_cr_   ;[ 1,1] cr = *cr_ptr++ 
||      AND     .L1X    B_num_pix,  -2,         A_i     ; Make num_pix even 
||      MV      .L2X    A_cb_data,  B_cb_ptr            ; Partitioning MV (overwrites B15/SP)
||      MVKLH   .S1     1,          A_one_lum           ; Constant: 1 
||      MVKLH   .S2     32,         B_k32_k128          ; Constant: 32 
||      MPY     .M2     B_k32_k128, 1,          B_p     ; Prolog collapse count 
||      MPYH    .M1     A_one_lum,  A_one_lum,  A_lneg  ; lneg = coeff[0] < 0 
;- 
        LDBU    .D1T1   *A_y_ptr++[2],          A_y0    ;[ 2,1] y0 = *y_ptr, y_ptr += 2
||      SHL     .S2X    A_one_lum,  4,          B_y_bias; ((128<<13)+16*luma) 
||      MVKH    .S1     0x84108410, A_sflip             ; Sign-flip cst, high 
 
        LDBU    .D2T2   *B_cb_ptr++,            B_cb_   ;[ 3,1] cb = *cb_ptr++ 
||      ADD     .D1     A_i,        2,          A_i     ; Adjust for para iter 
||      SHL     .S1     A_lneg,     20,         A_lneg  ; Handle luma < 0 
||      MV      .L1X    B_k32_k128, A_k32_k128          ; Twin constant reg. 
||      MV      .L2X    A_rgb_data, B_rgb_ptr           ; Partitioning MV 
;- 
        LDBU    .D1T2   *-A_y_ptr[1],           B_y1    ;[ 4,1] y1 = y_ptr[-1]
||      SHL     .S1     A_gcr_,     16,         A_gcr   ; Put gcr in high half 
||      SHL     .S2     B_gcb_,     16,         B_gcb   ; Put gcb in high half 
||      SUB     .L2X    B_y_bias,   A_lneg,     B_y_bias; Sign bit, coeff[0]<0 
 
        STW     .D1T2   B_irp,      *+A_SP[13]          ; Save IRP 
||      ADD     .L1     A_gcr,      A_rcr,      A_gcr_rcr ; Merge gcr, rcr 
||      ADD     .L2     B_gcb,      B_bcb,      B_gcb_bcb ; Merge gcb, bcb 
||      MVKLH   .S2     0xF800,     B_ms5               ; Mask 5, high 
;- 
; =========================== PIPE LOOP KERNEL ============================ ; 
;  Six execute packets of eight instructions each; one pass handles two
;  luma pixels (y0, y1) that share one cb/cr pair.  The "[c,s]" tag in each
;  comment is the original scheduler annotation for that instruction
;  (c = cycle within a single iteration).  Instructions from several
;  different iterations are in flight in every packet, so no line may be
;  moved, removed or re-registered in isolation.
;
;  Prolog collapsing: B_p is seeded with 128 in the prolog and multiplied
;  by 8 at the top of every pass (128, 1024, 8192, 65536, ...).  MPYUS uses
;  only the low 16 bits of B_p, so the product becomes 0 on the fourth pass
;  and stays 0; the [!B_p] store at the bottom is therefore suppressed
;  while the pipeline is still filling.
;
;  Pixel packing: each saturated channel is masked to its top 5 or 6 bits
;  and moved into place with multiplies instead of shifts:
;    - MPYHU  by the "32" half of k32_k128 moves blue down 11 bits,
;    - MPYHUS by the 0xF800 half of ms5 moves green down 5 bits AND
;      negates it, which is why green is merged with SUB rather than ADD,
;    - MPYHU  by the "1" half of A_one_lum moves pixel 0 to the low half.
;  Channels are signed (centered on zero) up to this point; the final XOR
;  with 0x84108410 flips the top bit of every R, G and B field in both
;  packed pixels.
;
;  NOTE(review): the four loads run on every pass, including the passes
;  that only drain the pipeline, so the loop appears to read a few samples
;  beyond the last pixel pair of y_data/cb_data/cr_data.  Confirm callers
;  tolerate that over-read.
conv_loop: 
  [ A_i]B       .S1     conv_loop                       ;[24,1] while (i) 
||      ADD     .L2X    B_rgb1,     A_rgb0,     B_rgb_  ;[24,1] merge pix 0, 1 
||      MPYHUS  .M1X    A_g0t,      B_ms5,      A_g0f   ;[18,2] >> 5 and negate 
||      SSHL    .S2     B_g1,       11,         B_g1s   ;[18,2] g1s = sat(g1) 
||      ADD     .D1     A_y0t,      A_rt,       A_r0    ;[12,3] r0  = y0t + rt 
||      SUB     .D2     B_y1t_,     B_y_bias,   B_y1t   ;[12,3] y1t-= y_bias 
||      SUB     .L1     A_cr_,      A_k32_k128, A_cr    ;[ 6,4] cr -= 128 
||      MPYUS   .M2     B_p,        8,          B_p     ; prolog collapse count 
;- 
        ADD     .D1     A_r0t,      A_b0f,      A_r_b0  ;[19,2] Merge r0, b0 
||      MPYHU   .M2     B_b1t,      B_k32_k128, B_b1f   ;[19,2] >> 11 
||      AND     .S2X    B_g1s,      A_ms6,      B_g1t   ;[19,2] g1t = g1s & ms6 
||      SSHL    .S1     A_r0,       11,         A_r0s   ;[13,3] r0s = sat(r0) 
||      ADD     .L2     B_y1t,      B_bt,       B_b1    ;[13,3] b1  = y1t + bt 
||      ADD     .L1X    B_gt_,      A_gt_,      A_gt    ;[13,3]gt=gcr*cr+gcb*cb 
||      MPYLH   .M1     A_cr,       A_gcr_rcr,  A_gt_   ;[ 7,4] gcr * cr 
||      LDBU    .D2T1   *B_cr_ptr++,            A_cr_   ;[ 1,5] cr  = *cr_ptr++ 
;- 
        XOR     .L2X    B_rgb_,     A_sflip,    B_rgb   ;[26,1] Fix sign bits 
||      MPYHUS  .M2     B_g1t,      B_ms5,      B_g1f   ;[20,2] >> 5 and negate 
||      SSHL    .S2     B_b1,       11,         B_b1s   ;[14,3] b1s = sat(b1) 
||      ADD     .L1X    A_y0t,      B_bt,       A_b0    ;[14,3] b0  = y0t + bt 
||      ADD     .S1     A_y0t,      A_gt,       A_g0    ;[14,3] g0  = y0t + gt 
||      MPY     .M1     A_y0,       A_one_lum,  A_y0t_  ;[ 8,4] y0t = y0 * luma 
||      SUB     .D2     B_cb_,      B_k32_k128, B_cb    ;[ 8,4] cb -= 128 
||      LDBU    .D1T1   *A_y_ptr++[2],          A_y0    ;[ 2,5] y0  = *y_ptr, y_ptr += 2
;- 
        SUB     .D1     A_r_b0,     A_g0f,      A_rgb0_ ;[21,2] merge r0,g0,b0 
||      ADD     .L2     B_r1t,      B_b1f,      B_r_b1  ;[21,2] merge r1, b1 
||      AND     .L1X    A_r0s,      B_ms5,      A_r0t   ;[15,3] r0t = r0s & ms5 
||      SSHL    .S1     A_b0,       11,         A_b0s   ;[15,3] b0s = sat(b0) 
||      ADD     .S2X    B_y1t,      A_rt,       B_r1    ;[15,3] r1  = y1t + rt 
||      MPY     .M1     A_cr,       A_gcr_rcr,  A_rt    ;[ 9,4] rt  = rcr * cr 
||      MPYLH   .M2     B_cb,       B_gcb_bcb,  B_gt_   ;[ 9,4] gcb * cb 
||      LDBU    .D2T2   *B_cb_ptr++,            B_cb_   ;[ 3,5] cb  = *cb_ptr++ 
;- 
        MPYHU   .M1     A_rgb0_,    A_one_lum,  A_rgb0  ;[22,2] rgb0 in lo half 
||      SUB     .D2     B_r_b1,     B_g1f,      B_rgb1  ;[22,2] merge r1,g1,b1 
||      AND     .L1X    A_b0s,      B_ms5,      A_b0t   ;[16,3] b0t = b0s & ms5 
||      AND     .L2     B_b1s,      B_ms5,      B_b1t   ;[16,3] b1t = b1s & ms5 
||      SSHL    .S1     A_g0,       11,         A_g0s   ;[16,3] g0s = sat(g0) 
||      SSHL    .S2     B_r1,       11,         B_r1s   ;[16,3] r1s = sat(r1) 
||      MPY     .M2X    B_y1,       A_one_lum,  B_y1t_  ;[10,4] y1t = y1 * luma 
||      LDBU    .D1T2   *-A_y_ptr[1],           B_y1    ;[ 4,5] y1  = y_ptr[-1]
;- 
  [!B_p]STW     .D2T2   B_rgb,      *B_rgb_ptr++        ;[29,1] *rgb_ptr++=rgb 
||      SUB     .D1     A_i,        2,          A_i     ;[23,2] i -= 2 
||      MPYHU   .M1     A_b0t,      A_k32_k128, A_b0f   ;[17,3] >> 11 
||      AND     .L1     A_g0s,      A_ms6,      A_g0t   ;[17,3] g0t = g0s & ms6 
||      AND     .L2     B_r1s,      B_ms5,      B_r1t   ;[17,3] r1t = r1s & ms5 
||      ADD     .S2X    B_y1t,      A_gt,       B_g1    ;[17,3] g1  = y1t + gt 
||      MPY     .M2     B_cb,       B_gcb_bcb,  B_bt    ;[11,4] bt  = bcb * cb 
||      SUB     .S1X    A_y0t_,     B_y_bias,   A_y0t   ;[11,4] y0t-= y_bias 
; =========================== PIPE LOOP EPILOG ============================ ; 
;  Only the tail of the last pixel pair is left uncollapsed: the merge
;  (ADD), the sign fix (XOR) and the final store (STW).  They are
;  interleaved with the register restores.
;
;  The stack pointer is recovered from IRP (B15 was used as B_cb_ptr in the
;  loop), then IRP, CSR, the return address and A10..A15 / B10..B14 are
;  reloaded from the frame laid out by the setup code.
;
;  Ordering hazards to preserve:
;    - A_sflip (A15) and B_rgb_ptr (B11) are both restore targets but are
;      still read by the XOR and the final STW below.  This works only
;      because those reads happen before the corresponding LDW results
;      arrive (load delay slots) -- do not move the XOR/STW later or the
;      LDWs of A15/B11 earlier.
;    - The final STW and "NOP 3" occupy the delay slots of "B B_ret".
;
;  NOTE(review): the final STW is unconditional, so one output word
;  appears to be written to rgb_data even when num_pixels is 0 (or 1, which
;  the prolog rounds down to 0).  Confirm callers never pass fewer than 2.
 
; ================ SYMBOLIC REGISTER ASSIGNMENTS: CLEANUP ================= ; 
        .asg            B15,        B_SP                ; Stack ptr, B side 
        .asg            A3,         A_SP                ; Stack ptr, A side 
        .asg            A0,         A_csr               ; CSR value 
        .asg            B0,         B_irp               ; IRP value (B0 here; was B2 in setup)
        .asg            B3,         B_ret               ; Return address 
; ========================================================================= ; 
;- 
        MVC     .S2     IRP,        B_SP                ; Restore stack ptr 
||      ADD     .L2X    B_rgb1,     A_rgb0,     B_rgb_  ;[24,5] merge pix 0, 1 
 
        MV      .L1X    B_SP,       A_SP                ; Twin Stack Pointer 
||      LDW     .D2T2   *+B_SP[13], B_irp               ; Get IRP's value 
 
        LDW     .D1T2   *+A_SP[ 1], B_ret               ; Get return address 
||      LDW     .D2T1   *+B_SP[ 2], A_csr               ; Get CSR's value 
 
        LDW     .D1T2   *+A_SP[ 3], B10                 ; Restore B10 
||      LDW     .D2T1   *+B_SP[ 4], A10                 ; Restore A10 
;- 
        LDW     .D1T2   *+A_SP[11], B14                 ; Restore B14 
||      LDW     .D2T1   *+B_SP[12], A14                 ; Restore A14 
||      XOR     .L2X    B_rgb_,     A_sflip,    B_rgb   ;[26,5] fix sign bits 
 
        LDW     .D1T2   *+A_SP[ 7], B12                 ; Restore B12 
||      LDW     .D2T1   *+B_SP[ 8], A12                 ; Restore A12 
 
        LDW     .D1T2   *+A_SP[ 9], B13                 ; Restore B13 
||      LDW     .D2T1   *+B_SP[10], A13                 ; Restore A13 
||      MVC     .S2     B_irp,      IRP                 ; Restore IRP 
;- 
        LDW     .D1T2   *+A_SP[ 5], B11                 ; Restore B11 
||      LDW     .D2T1   *+B_SP[ 6], A11                 ; Restore A11 
||      B       .S2     B_ret                           ; Return to caller 
 
        MVC     .S2X    A_csr,      CSR                 ; Restore CSR 
||      LDW     .D2T1   *++B_SP[14],A15                 ; Restore A15, release frame
; ===== Interruptibility state (GIE) restored here ===== 
 
        STW     .D2T2   B_rgb,      *B_rgb_ptr          ;[29,5] *rgb_ptr = rgb (last pair)
 
        NOP             3 
 
; ===== Branch occurs ===== 
; ===== Interrupts may occur here ===== 
 
* ========================================================================= * 
*   End of file:  ycbcr422pl_to_rgb565_h.asm                                * 
* ------------------------------------------------------------------------- * 
*             Copyright (c) 1999 Texas Instruments, Incorporated.           * 
*                            All Rights Reserved.                           * 
* ========================================================================= *