www.pudn.com > MPEG4Codec.zip > yuv2rgb_mmx.c
/************************************************************************** * * * This code has been developed by John Funnell. This software is an * * implementation of a part of one or more MPEG-4 Video tools as * * specified in ISO/IEC 14496-2 standard. Those intending to use this * * software module in hardware or software products are advised that its * * use may infringe existing patents or copyrights, and any such use * * would be at such party's own risk. The original developer of this * * software module and his/her company, and subsequent editors and their * * companies (including Project Mayo), will have no liability for use of * * this software or modifications or derivatives thereof. * * * * Project Mayo gives users of the Codec a license to this software * * module or modifications thereof for use in hardware or software * * products claiming conformance to the MPEG-4 Video Standard as * * described in the Open DivX license. * * * * The complete Open DivX license can be found at * * http://www.projectmayo.com/opendivx/license.php * * * **************************************************************************/ /** * Copyright (C) 2001 - Project Mayo * * John Funnell * Andrea Graziani * * DivX Advanced Research Center* **/ // yuv2rgb_mmx.c // /* MMX version of colourspace conversion 13 Feb 2001 - John Funnell, added -height feature to invert output, fixed 24-bit overwrite TODO: 1. Extract common part of from 32, 24 and 16 bit conversions into inline func to avoid duplication of the core MMX code. 2. Experiment with using prefetch instructions to improve performance Using matrix of SMPTE 170M This is what we're doing: Step 1. Y -= 16 U -= 128 V -= 128 Step 2. Y /= 219 U /= 224 V /= 224 Step 3. now we want the inverse of this matrix: / 0.299 0.114 0.587 \ | -0.169 0.500 -0.331 | \ 0.500 -0.081 -0.419 / which is, approximately: / 2568 0 3343 \ | 2568 f36e e5e2 | / 65536 * 8 \ 2568 40cf 0 / including the multiplies in Step 2 */ #include // for memset() & memcpy() #include "portab.h" #include "yuv2rgb.h" /** * **/ #define MAXIMUM_Y_WIDTH 800 #define _USE_PREFETCH /* static constants */ /* colourspace conversion matrix values */ static uint64_t mmw_mult_Y = 0x2568256825682568; static uint64_t mmw_mult_U_G = 0xf36ef36ef36ef36e; static uint64_t mmw_mult_U_B = 0x40cf40cf40cf40cf; static uint64_t mmw_mult_V_R = 0x3343334333433343; static uint64_t mmw_mult_V_G = 0xe5e2e5e2e5e2e5e2; /* various masks and other constants */ static uint64_t mmb_0x10 = 0x1010101010101010; static uint64_t mmw_0x0080 = 0x0080008000800080; static uint64_t mmw_0x00ff = 0x00ff00ff00ff00ff; static uint64_t mmw_cut_red = 0x7c007c007c007c00; static uint64_t mmw_cut_green = 0x03e003e003e003e0; static uint64_t mmw_cut_blue = 0x001f001f001f001f; /**** YUV -> RGB conversion, 32-bit output ****/ /* if height_y is negative then the output image will be inverted */ /* note: _stride_out parameter is ignored in yuv to rgb conversion it's assumed that stride_out = 4 * width_y for the 32 bit color bitmap */ void yuv2rgb_32(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int _stride_out) { int y, horiz_count; int stride_out = width_y * 4; if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } horiz_count = -(width_y >> 3); for (y=0; y RGB conversion, 24-bit output ****/ void yuv2rgb_24(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int _stride_out) { int y, horiz_count; uint8_t *puc_out_remembered; int stride_out = width_y * 3; if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } horiz_count = -(width_y >> 3); for (y=0; y RGB conversion, 16-bit output (two flavours) ****/ /* 5 Jan 2001 - Andrea Graziani */ static uint64_t mask_5 = 0xf8f8f8f8f8f8f8f8; static uint64_t mask_6 = 0xfcfcfcfcfcfcfcfc; static uint64_t mask_blue = 0x1f1f1f1f1f1f1f1f; /* all stride values are in _bytes_ */ void yuv2rgb_555(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int _stride_out) { int y, horiz_count; int stride_out = width_y * 2; if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } horiz_count = -(width_y >> 3); for (y=0; y 255 ? 255 : (a)<0 ? 0 : (a) #define _R(y,u,v) (0x2568*(y) + 0x3343*(u)) /0x2000 #define _G(y,u,v) (0x2568*(y) - 0x0c92*(v) - 0x1a1e*(u)) /0x2000 #define _B(y,u,v) (0x2568*(y) + 0x40cf*(v)) /0x2000 #define _mR 0x7c00 #define _mG 0x03e0 #define _mB 0x001f #define _Ps565(r,g,b) ( ((r & 0xF8) >> 3) | (((g & 0xF8) << 3)) | (((b & 0xF8) << 8)) ) void yuv2rgb_565(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int _stride_out) { int y, horiz_count; unsigned short * pus_out; int stride_out = width_y * 2; if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } pus_out = (unsigned short *) puc_out; horiz_count = -(width_y >> 3); for (y=0; y >1] - 128; v = puc_v[x>>1] - 128; _r = _R(y,u,v); _g = _G(y,u,v); _b = _B(y,u,v); r = _S(_r); g = _S(_g); b = _S(_b); pus_out[0] = (unsigned short) _Ps565(r,g,b); pus_out++; } ***/ _asm { push eax push ebx push ecx push edx push edi mov eax, puc_out mov ebx, puc_y mov ecx, puc_u mov edx, puc_v mov edi, horiz_count horiz_loop: // load data movd mm2, [ecx] ; mm2 = ________u3u2u1u0 movd mm3, [edx] ; mm3 = ________v3v2v1v0 movq mm0, [ebx] ; mm0 = y7y6y5y4y3y2y1y0 pxor mm7, mm7 ; zero mm7 // convert chroma part punpcklbw mm2, mm7 ; mm2 = __u3__u2__u1__u0 punpcklbw mm3, mm7 ; mm3 = __v3__v2__v1__v0 psubw mm2, mmw_0x0080 ; mm2 -= 128 psubw mm3, mmw_0x0080 ; mm3 -= 128 psllw mm2, 3 ; mm2 *= 8 psllw mm3, 3 ; mm3 *= 8 movq mm4, mm2 ; mm4 = mm2 = u movq mm5, mm3 ; mm5 = mm3 = v pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma paddsw mm2, mm3 ; mm2 = green chroma // convert luma part psubusb mm0, mmb_0x10 ; mm0 -= 16 movq mm1, mmw_0x00ff ; mm1 = 00ff00ff00ff00ff psrlw mm0, 8 ; mm0 = __y7__y5__y3__y1 luma odd pand mm1, mm0 ; mm1 = __y6__y4__y2__y0 luma even psllw mm0, 3 ; mm0 *= 8 psllw mm1, 3 ; mm1 *= 8 pmulhw mm0, mmw_mult_Y ; mm0 luma odd *= luma coeff pmulhw mm1, mmw_mult_Y ; mm1 luma even *= luma coeff // complete the matrix calc with the addictions movq mm3, mm4 ; copy blue chroma movq mm6, mm5 ; copy red chroma movq mm7, mm2 ; copy green chroma paddsw mm3, mm0 ; mm3 = luma odd + blue chroma paddsw mm4, mm1 ; mm4 = luma even + blue chroma paddsw mm6, mm0 ; mm6 = luma odd + red chroma paddsw mm5, mm1 ; mm5 = luma even + red chroma paddsw mm7, mm0 ; mm7 = luma odd + green chroma paddsw mm2, mm1 ; mm2 = luma even + green chroma // clipping packuswb mm3, mm3 packuswb mm4, mm4 packuswb mm6, mm6 packuswb mm5, mm5 packuswb mm7, mm7 packuswb mm2, mm2 // interleave odd and even parts punpcklbw mm4, mm3 ; mm4 = b7b6b5b4b3b2b1b0 blue punpcklbw mm5, mm6 ; mm5 = r7r6r5r4r3r2r1r0 red punpcklbw mm2, mm7 ; mm2 = g7g6g5g4g3g2g1g0 green // mask not needed bits (using 555) pand mm4, mask_5 pand mm5, mask_5 pand mm2, mask_5 // mix colors and write psrlw mm4, 3 ; mm4 = red shifted pand mm4, mask_blue ; mask the blue again pxor mm7, mm7 ; zero mm7 movq mm1, mm5 ; mm1 = copy blue movq mm3, mm4 ; mm3 = copy red movq mm6, mm2 ; mm6 = copy green punpckhbw mm1, mm7 punpckhbw mm3, mm7 punpckhbw mm6, mm7 psllw mm6, 3 ; shift green psllw mm1, 8 ; shift blue por mm6, mm3 por mm6, mm1 movq 8[eax], mm6 punpcklbw mm2, mm7 ; mm2 = __g3__g2__g1__g0 already masked punpcklbw mm4, mm7 punpcklbw mm5, mm7 psllw mm2, 3 ; shift green psllw mm5, 8 ; shift blue por mm2, mm4 por mm2, mm5 movq [eax], mm2 add ebx, 8 ; puc_y += 8; add ecx, 4 ; puc_u += 4; add edx, 4 ; puc_v += 4; add eax, 16 ; puc_out += 16 // wrote 16 bytes inc edi jne horiz_loop pop edi pop edx pop ecx pop ebx pop eax emms } /***/ puc_y += stride_y; if (y%2) { puc_u += stride_uv; puc_v += stride_uv; } puc_out += stride_out; } } /***/ /**** YUV -> YUV conversions (more-or-less straight copy) - still in development ****/ void yuy2_out(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out) { int y; uint8_t* puc_out2; unsigned int stride_diff = 4 * stride_out - 2 * width_y; // expressed in bytes if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } puc_out2 = puc_out + 2 * stride_out; for (y=height_y/2; y; y--) { register uint8_t *py, *py2, *pu, *pv; register int x; uint32_t tmp; py = puc_y; py2 = puc_y + stride_y; pu = puc_u; pv = puc_v; for (x=width_y/2; x; x--) { tmp = *(py++); tmp |= *(pu++) << 8; tmp |= *(py++) << 16; tmp |= *(pv++) << 24; *(uint32_t*)puc_out=tmp; puc_out += 4; tmp &= 0xFF00FF00; tmp |= *(py2++); tmp |= *(py2++) << 16; *(uint32_t*)puc_out2=tmp; puc_out2 += 4; } puc_y += 2*stride_y; puc_u += stride_uv; puc_v += stride_uv; puc_out += stride_diff; puc_out2 += stride_diff; } } /*** YUV 4:2:0 -> UYVY ***/ void uyvy_out(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out) { int y; uint8_t* puc_out2; unsigned int stride_diff = 4 * stride_out - 2 * width_y; // expressed in bytes if (height_y < 0) { /* we are flipping our output upside-down */ height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } puc_out2 = puc_out + 2 * stride_out; for (y=height_y/2; y; y--) { register uint8_t *py, *py2, *pu, *pv; register int x; uint32_t tmp; py = puc_y; py2 = puc_y + stride_y; pu = puc_u; pv = puc_v; for (x=width_y/2; x; x--) { tmp = *(pu++); tmp |= *(py++) << 8; tmp |= *(pv++) << 16; tmp |= *(py++) << 24; *(uint32_t*)puc_out=tmp; puc_out += 4; tmp &= 0x00FF00FF; tmp |= *(py2++) << 8; tmp |= *(py2++) << 24; *(uint32_t*)puc_out2=tmp; puc_out2 += 4; } puc_y += 2*stride_y; puc_u += stride_uv; puc_v += stride_uv; puc_out += stride_diff; puc_out2 += stride_diff; } } /*** YUV edged -> YUV conversion (just remove the edges for YV12) ***/ void yuv12_out(uint8_t *puc_y, int stride_y, uint8_t *puc_u, uint8_t *puc_v, int stride_uv, uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out) { int i; unsigned char * pauc_out[3]; if (height_y < 0) { // we are flipping our output upside-down height_y = -height_y; puc_y += (height_y - 1) * stride_y ; puc_u += (height_y/2 - 1) * stride_uv; puc_v += (height_y/2 - 1) * stride_uv; stride_y = -stride_y; stride_uv = -stride_uv; } pauc_out[0] = puc_out; pauc_out[1] = puc_out + stride_out * height_y; pauc_out[2] = puc_out + stride_out * height_y * 5 / 4; for (i=0; i