// www.pudn.com > coremp4-1.0.zip > idct.cpp


/***************************************************************************** 
 * This program is free software ; you can redistribute it and/or modify 
 * it under the terms of the GNU General Public License as published by 
 * the Free Software Foundation; either version 2 of the License, or 
 * (at your option) any later version. 
 * 
 * This program is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of 
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 
 * GNU General Public License for more details. 
 * 
 * You should have received a copy of the GNU General Public License 
 * along with this program; if not, write to the Free Software 
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA 
 * 
 ***************************************************************************** 
 * 
 * Authors: 
 * 
 *	Andrea	Graziani  (Ag): Original source code (Open Divx Decoder 0.4a). 
 *	Pedro	Mateu     (Pm) and 
 *	Gabor	Kovacs    (Kg) Heavily modified and optimized code 
 * Michal Bacik    Porting into C++ 
 * 
 ****************************************************************************/ 
 
 
#include "Rules.h" 
#include "Util.h" 
 
//---------------------------- 
 
// 16-bit signed type for IDCT block coefficients. The generated ARM code below
// reads block entries as signed halfwords spaced 2 bytes apart.
typedef short idct_block_t;

// Windows CE and Palm OS builds compile in the WMMX helpers further down
// (everything guarded by USE_IDCT_WMMX).
#if defined _WIN32_WCE || defined __PALMOS__
#define USE_IDCT_WMMX
#endif
 
//---------------------------- 
// 2D Inverse Discrete Cosine Transform (iDCT) 
 
// SAT(Value): clamp the signed integer lvalue Value to the range 0..255, in place.
//
// The macro expands to a complete statement (it carries its own trailing ';'),
// so both "SAT(x)" and "SAT(x);" are valid at the call site.
//
// The former non-ARM variant was a branch-free bit trick
// ("Value <<= Value >> 16; Value |= (Value << 23) >> 31;"). For negative inputs
// it shifted by a negative count, which is undefined behaviour in C/C++, and it
// only produced a correct low byte for inputs below 512. The portable clamp
// that the ARM build already used is now used everywhere; the low 8 bits are
// what the old variant promised, and the upper bits are now clean as well.
#define SAT(Value) Value = Value < 0 ? 0: (Value > 255 ? 255: Value);
 
 
// ADDSAT32(a, Dst, Add32)
// Adds Add32 to a and stores the result in Dst, treating the 32-bit words as
// four packed bytes. Steps:
//    b = a + Add32                        plain 32-bit sum
//    a = ((a ^ Add32) & ~b) | (a & Add32) per-bit carry-out of that sum
//    a &= MaskCarry                       keep only the selected carry bits
//    b -= a << 1                          undo the carry that leaked into the next byte
//    b |= (a << 1) - (a >> 7)             fill every byte that carried with 0xFF
// The macro relies on three names that must exist at the expansion site:
// scratch variables b and c, and MaskCarry. It overwrites a, b and c.
// NOTE(review): MaskCarry is not defined in this part of the file; presumably it
// is 0x80808080 (the same constant BuildRowConst uses for the equivalent ARM
// sequence), which makes this a per-byte unsigned saturating add - confirm.
#define ADDSAT32(a,Dst,Add32)    \
   b = a + Add32;             \
   c = a & Add32;             \
   a ^= Add32;                \
   a &= ~b;                \
   a |= c;                    \
   a &= MaskCarry;               \
   c = a << 1;                \
   b -= c;  /* adjust neighbour */  \
   b |= c - (a >> 7); /* mask */ \
   Dst = b;

// SUBSAT32(a, Dst, Add32)
// Same sequence as ADDSAT32, but a is complemented on entry and the result is
// complemented before it is stored, i.e. Dst = ~addsat(~a, Add32).
// NOTE(review): with MaskCarry == 0x80808080 this is a per-byte subtraction of
// Add32 from a that saturates at 0 - confirm.
// Requires the same b, c and MaskCarry names as ADDSAT32 and overwrites a, b, c.
#define SUBSAT32(a,Dst,Add32)    \
   a = ~a;                    \
   b = a + Add32;             \
   c = a & Add32;             \
   a ^= Add32;                \
   a &= ~b;                \
   a |= c;                    \
   a &= MaskCarry;               \
   c = a << 1;                \
   b -= c;  /* adjust neighbour */  \
   b |= c - (a >> 7); /* mask */ \
   Dst = ~b;
 
//---------------------------- 
#ifdef ARM 
#include "DynamicArmCode.h" 
//---------------------------- 
 
// BuildMCol8
// Emits, through dc, the ARM routine that runs the column pass of the 8x8 IDCT
// on one column of 16-bit coefficients in place, and returns the routine's
// entry label.
//
// Register contract of the emitted routine:
//    r6        address of the column's first coefficient (rows are 16 bytes apart)
//    r7, r8    belong to the caller and must be preserved; this routine never touches them
//    lr        return address (every path ends with "mov pc, lr")
//    r0-r5, r9-r12  scratch
//
// Four code paths are emitted, chosen by which inputs are non-zero:
//    - only row 0 non-zero (or nothing): store row0 << 3 to all eight rows
//      (or return at once when row 0 is zero too)
//    - mode_1: only rows 0 and 1 non-zero
//    - mode_2: rows 1, 3, 5 and 7 all zero
//    - mode_3: general case
// Every result is shifted right arithmetically by 8 before it is stored.
//
// NOTE(review): the per-instruction remarks assume the C_dyn_code helpers map
// directly onto the ARM instructions they are named after (SHalf()/Half() select
// halfword access, Set() sets the S bit and Cond() the condition of the next
// instruction, Label(false) declares a label that PutLabel() places later) -
// confirm against DynamicArmCode.h.
static void *BuildMCol8(C_dyn_code &dc){

                              //byte distance between two rows of the block (8 shorts)
   const dword pitch = 16;
// Register <-> variable mapping (xN naming follows the reference integer IDCT):
// r10 = x0  (row 0)
// r4  = x1  (row 4)
// r2  = x2  (row 6)
// r1  = x3  (row 2)
// r3  = x4  (row 1)
// r12 = x5  (row 7)
// r0  = x6  (row 5)
// r5  = x7  (row 3)
// r11 = x8
// r9  = tmp (x567)
   void *mode_1 = dc.Label(false);
   void *mode_2 = dc.Label(false);
   void *mode_3 = dc.Label(false);

   dc.Align(16);

   void *fnc_begin = dc.Label();

                              //load the eight signed 16-bit inputs of the column
   dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4*pitch);
   dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 5*pitch);
   dc.SHalf(); dc.Ldr(dc.r12,dc.r6, 7*pitch);
   dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 3*pitch);
   dc.SHalf(); dc.Ldr(dc.r2, dc.r6, 6*pitch);
   dc.SHalf(); dc.Ldr(dc.r1, dc.r6, 2*pitch);
   dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 1*pitch);
   dc.SHalf(); dc.Ldr(dc.r10, dc.r6, 0*pitch);

                              //r9 = x5|x6|x7 (kept for the mode_2 test)
                              //r11 = x1|x2|x3|x5|x6|x7, flags set from the last Orr
   dc.Orr(dc.r9, dc.r12, dc.r0);
   dc.Orr(dc.r9, dc.r9, dc.r5);
   dc.Orr(dc.r11, dc.r9, dc.r2);
   dc.Orr(dc.r11, dc.r11, dc.r4);
   dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r1);

   dc.Branch(dc.NE, mode_2);
                              //here only x0 and x4 can be non-zero
   dc.Cmp(dc.r3, 0);
   dc.Branch(dc.NE, mode_1);
                              //only x0 left: nothing to do for an all-zero column
   dc.Cmp(dc.r10, 0);
   dc.Cond(dc.EQ); dc.Mov(dc.pc, dc.lr);
                              //constant column: every row becomes x0 << 3
   dc.Mov(dc.r10, dc.r10, dc.LSL, 3);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x10);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x20);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x30);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x40);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x50);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x60);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x70);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(mode_1);
                   //x0,x4 are the only non-zero inputs
                   //constants are assembled as Mov+Orr pairs:
                   // 564|1 = 565 (W7), 2832|9 = 2841 (W1), 2400|8 = 2408 (W3), 1600|9 = 1609 (W5)
   dc.Mov(dc.r11, dc.r3);
   dc.Mov(dc.r2, 564);
   dc.Orr(dc.r2, dc.r2, 1);
   dc.Mov(dc.r9, dc.r3);
   dc.Mul(dc.r2, dc.r11, dc.r2);       //r2 = W7 * x4
   dc.Mov(dc.r11, 2832);
   dc.Orr(dc.r11, dc.r11, 9);
   dc.Mul(dc.r4, dc.r9, dc.r11);       //r4 = W1 * x4
   dc.Mov(dc.r11, 2400);
   dc.Orr(dc.r11, dc.r11, 8);
   dc.Mul(dc.r5, dc.r9, dc.r11);       //r5 = W3 * x4
   dc.Mov(dc.r11, 1600);
   dc.Mov(dc.r1, dc.r10, dc.LSL, 11);
   dc.Orr(dc.r11, dc.r11, 9);
   dc.Mul(dc.r0, dc.r3, dc.r11);       //r0 = W5 * x4
   dc.Add(dc.r1, dc.r1, 128);          //r1 = (x0 << 11) + 128

                              //rows 0,1 = (r1 + W1*x4) >> 8, (r1 + W3*x4) >> 8
   dc.Add(dc.r3, dc.r4, dc.r1);
   dc.Add(dc.r11, dc.r5, dc.r1);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0);
   dc.Half(); dc.Str(dc.r11, dc.r6, 0x10);

                              //rows 2,3 = (r1 + W5*x4) >> 8, (r1 + W7*x4) >> 8
   dc.Add(dc.r3, dc.r0, dc.r1);
   dc.Add(dc.r11, dc.r2, dc.r1);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0x20);
   dc.Half(); dc.Str(dc.r11, dc.r6, 0x30);

                              //rows 4,5 = (r1 - W7*x4) >> 8, (r1 - W5*x4) >> 8
   dc.Sub(dc.r3, dc.r1, dc.r2);
   dc.Sub(dc.r11, dc.r1, dc.r0);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0x40);
   dc.Half(); dc.Str(dc.r11, dc.r6, 0x50);

                              //rows 6,7 = (r1 - W3*x4) >> 8, (r1 - W1*x4) >> 8
   dc.Sub(dc.r3, dc.r1, dc.r5);
   dc.Sub(dc.r11, dc.r1, dc.r4);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0x60);
   dc.Half(); dc.Str(dc.r11, dc.r6, 0x70);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(mode_2);
                //x0,x1,x2,x3 path: taken when x4|x5|x6|x7 == 0, otherwise go to mode_3
   dc.Set(); dc.Orr(dc.r11, dc.r9, dc.r3);
   dc.Branch(dc.NE, mode_3);
   dc.Mov(dc.r3, dc.r10, dc.LSL, 11);
   dc.Add(dc.r3, dc.r3, 128);                //r3 = (x0 << 11) + 128
   dc.Mov(dc.r9, 1104);
   dc.Add(dc.r5, dc.r3, dc.r4, dc.LSL, 11);  //r5 = x8 = x0 + (x1 << 11)
   dc.Add(dc.r11, dc.r2, dc.r1);
   dc.Orr(dc.r9, dc.r9, 4);                  //r9 = 1108 (W6)
   dc.Sub(dc.r3, dc.r3, dc.r4, dc.LSL, 11);  //r3 = x0 - (x1 << 11)
   dc.Mul(dc.r4, dc.r11, dc.r9);             //r4 = W6 * (x3 + x2)
   dc.Mov(dc.r11, 3776);
   dc.Orr(dc.r11, dc.r11, 8);                //r11 = 3784 (W2 + W6)
   dc.Mul(dc.r11, dc.r2, dc.r11);
   dc.Sub(dc.r2, dc.r4, dc.r11);             //r2 = W6*(x3+x2) - (W2+W6)*x2
   dc.Mov(dc.r11, 1568);                     //1568 = W2 - W6
   dc.Mul(dc.r11, dc.r1, dc.r11);
   dc.Add(dc.r0, dc.r2, dc.r3);
   dc.Add(dc.r1, dc.r11, dc.r4);             //r1 = W6*(x3+x2) + (W2-W6)*x3
   dc.Add(dc.r4, dc.r5, dc.r1);
   dc.Sub(dc.r3, dc.r3, dc.r2);
   dc.Sub(dc.r5, dc.r5, dc.r1);
   dc.Mov(dc.r1, dc.r4, dc.ASR, 8);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r2, dc.r0, dc.ASR, 8);
   dc.Mov(dc.r4, dc.r5, dc.ASR, 8);
                              //with the odd inputs zero the output is mirrored: row n == row 7-n
   dc.Half(); dc.Str(dc.r1, dc.r6, 0x00);
   dc.Half(); dc.Str(dc.r2, dc.r6, 0x10);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0x20);
   dc.Half(); dc.Str(dc.r4, dc.r6, 0x30);
   dc.Half(); dc.Str(dc.r4, dc.r6, 0x40);
   dc.Half(); dc.Str(dc.r3, dc.r6, 0x50);
   dc.Half(); dc.Str(dc.r2, dc.r6, 0x60);
   dc.Half(); dc.Str(dc.r1, dc.r6, 0x70);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(mode_3);
                //general case: x0,x1,x2,x3,x4,x5,x6,x7
                //negative constants are built as Mvn (+ Eor), e.g. ~0xd40 ^ 0xd = -3406

   dc.Mov(dc.r9, 564);
   dc.Orr(dc.r9, dc.r9, 1);         //W7 (565)
   dc.Add(dc.r11, dc.r12, dc.r3);
   dc.Mul(dc.r11, dc.r9, dc.r11);    //x8 = W7 * (x5 + x4)

   dc.Mov(dc.r9, 2272);
   dc.Orr(dc.r9, dc.r9, 4);         //W1_minus_W7 (2276)
   dc.Mla(dc.r3, dc.r9, dc.r3, dc.r11);    //x4 = x8 + (W1_minus_W7) * x4

   dc.Mvn(dc.r9, 0xd40);
   dc.Eor(dc.r9, dc.r9, 0xd);    //minus_W1_minus_W7 (-3406)
   dc.Mla(dc.r12, dc.r9, dc.r12, dc.r11);  //x5 = x8 + (minus_W1_minus_W7) * x5

   dc.Mov(dc.r9, 2400);
   dc.Orr(dc.r9, dc.r9, 8);         //W3 (2408)
   dc.Add(dc.r11, dc.r0, dc.r5);
   dc.Mul(dc.r11, dc.r9, dc.r11);    //x8 = W3 * (x6 + x7)

   dc.Mvn(dc.r9, 0x310);
   dc.Eor(dc.r9, dc.r9, 0xe);    //W5_minus_W3 (-799)
   dc.Mla(dc.r0, dc.r9, dc.r0, dc.r11);    //x6 = x8 + (W5_minus_W3) * x6

   dc.Mvn(dc.r9, 0xfb0);         //minus_W3_minus_W5 (-4017)
   dc.Mla(dc.r5, dc.r9, dc.r5, dc.r11);    //x7 = x8 + minus_W3_minus_W5 * x7

   dc.Mov(dc.r10, dc.r10, dc.LSL, 11);
   dc.Add(dc.r10, dc.r10, 128);     //x0 = (x0 << 11) + 128
   dc.Add(dc.r11, dc.r10, dc.r4, dc.LSL, 11); //x8 = x0 + (x1 << 11)
   dc.Sub(dc.r10, dc.r10, dc.r4, dc.LSL, 11); //x0 = x0 - (x1 << 11)

   dc.Mov(dc.r9, 1104);
   dc.Orr(dc.r9, dc.r9, 4);         //W6 (1108)
   dc.Add(dc.r4, dc.r1, dc.r2);
   dc.Mul(dc.r4, dc.r9, dc.r4);        //x1 = W6 * (x3 + x2)

   dc.Mvn(dc.r9, 0xec0);
   dc.Eor(dc.r9, dc.r9, 7);    //minus_W2_minus_W6 (-3784)
   dc.Mla(dc.r2, dc.r9, dc.r2, dc.r4);     //x2 = x1 + minus_W2_minus_W6 * x2

   dc.Mov(dc.r9, 0x620);         //W2_minus_W6 (1568)
   dc.Mla(dc.r1, dc.r9, dc.r1, dc.r4);     //x3 = x1 + (W2_minus_W6) * x3

   dc.Add(dc.r4, dc.r3, dc.r0);        //x1 = x4 + x6
   dc.Sub(dc.r3, dc.r3, dc.r0);        //x4 -= x6
   dc.Add(dc.r0, dc.r12, dc.r5);        //x6 = x5 + x7
   dc.Sub(dc.r12, dc.r12, dc.r5);        //x5 -= x7
   dc.Add(dc.r5, dc.r11, dc.r1);        //x7 = x8 + x3
   dc.Sub(dc.r11, dc.r11, dc.r1);        //x8 -= x3
   dc.Add(dc.r1, dc.r10, dc.r2);        //x3 = x0 + x2
   dc.Sub(dc.r10, dc.r10, dc.r2);        //x0 -= x2

   dc.Add(dc.r9, dc.r3, dc.r12);       //x4 + x5
   dc.Sub(dc.r3, dc.r3, dc.r12);       //x4 - x5
   dc.Mov(dc.r12, 181);
   dc.Mul(dc.r2, dc.r9, dc.r12);       //181 * (x4 + x5)
   dc.Mul(dc.r9, dc.r3, dc.r12);       //181 * (x4 - x5)
   dc.Add(dc.r2, dc.r2, 128);      //x2 = 181 * (x4 + x5) + 128
   dc.Add(dc.r3, dc.r9, 128);      //x4 = 181 * (x4 - x5) + 128

                              //rows 0 and 7
   dc.Add(dc.r9, dc.r5, dc.r4);
   dc.Sub(dc.r5, dc.r5, dc.r4);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 8);      //(x7 + x1) >> 8
   dc.Mov(dc.r5, dc.r5, dc.ASR, 8);      //(x7 - x1) >> 8
   dc.Half(); dc.Str(dc.r9, dc.r6, 0x00);
   dc.Half(); dc.Str(dc.r5, dc.r6, 0x70);

                              //rows 1 and 6 (x2 is pre-shifted by 8 inside the add/sub)
   dc.Add(dc.r9, dc.r1, dc.r2, dc.ASR, 8);
   dc.Sub(dc.r1, dc.r1, dc.r2, dc.ASR, 8);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 8);      //(x3 + (x2 >> 8)) >> 8
   dc.Mov(dc.r1, dc.r1, dc.ASR, 8);      //(x3 - (x2 >> 8)) >> 8
   dc.Half(); dc.Str(dc.r9, dc.r6, 0x10);
   dc.Half(); dc.Str(dc.r1, dc.r6, 0x60);

                              //rows 2 and 5 (x4 is pre-shifted by 8 inside the add/sub)
   dc.Add(dc.r9, dc.r10, dc.r3, dc.ASR, 8);
   dc.Sub(dc.r10, dc.r10, dc.r3, dc.ASR, 8);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 8);      //(x0 + (x4 >> 8)) >> 8
   dc.Mov(dc.r10, dc.r10, dc.ASR, 8);    //(x0 - (x4 >> 8)) >> 8
   dc.Half(); dc.Str(dc.r9, dc.r6, 0x20);
   dc.Half(); dc.Str(dc.r10, dc.r6, 0x50);

                              //rows 3 and 4
   dc.Add(dc. r9, dc.r11, dc.r0);
   dc.Sub(dc.r11, dc.r11, dc.r0);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 8);      //(x8 + x6) >> 8
   dc.Mov(dc.r11, dc.r11, dc.ASR, 8);    //(x8 - x6) >> 8
   dc.Half(); dc.Str(dc.r9, dc.r6, 0x30);
   dc.Half(); dc.Str(dc.r11, dc.r6, 0x40);

   dc.Mov(dc.pc, dc.lr);

   return fnc_begin;
}
 
//---------------------------- 
 
// BuildRowConst
// Emits the ARM routine that writes one 8-pixel output row whose only non-zero
// coefficient is the first one, and returns the routine's entry label.
//
// Register contract of the emitted routine:
//    r0   first coefficient of the row (Block[0])
//    r6   Block (not referenced here)
//    r7   Src row pointer, or 0 when there is no source to add to
//    r8   Dst row pointer
//    lr   return address
// With v = (r0 + 32) >> 6 the routine does one of:
//    Src == 0 : Dst[0..7] = v clamped to 0..255
//    v == 0   : Dst[0..7] = Src[0..7]
//    v > 0    : Dst[i] = Src[i] + v, byte-wise with carry detection via 0x80808080
//    v < 0    : Dst[i] = Src[i] - (-v), same sequence applied to the complemented source
// r7 is advanced by 8 whenever Src is read.
//
// NOTE(review): LdrAdvance(rd, rn, 8) is taken to be a post-indexed load (load
// from [rn], then rn += 8); the following Ldr at offset -4 fetching the second
// word of the row is consistent with that - confirm in DynamicArmCode.h.
static void *BuildRowConst(C_dyn_code &dc){

// r0 Block[0]
// r6 Block
// r7 Src
// r8 Dst
   void *l_no_src = dc.Label(false);
   void *l_zero = dc.Label(false);
   void *l_sub = dc.Label(false);
                              //bit 7 of every byte; data word is placed after the code (see PutLabel below)
   void *CarryMask = dc.DeclareData(0x80808080);

   dc.Align(16);

   void *fnc_begin = dc.Label();

   dc.Add(dc.r0, dc.r0, 32);
   dc.Cmp(dc.r7, 0);
   dc.Mov(dc.r3, dc.r0, dc.ASR, 6);    //r3 = v = (Block[0] + 32) >> 6
   dc.Branch(dc.EQ, l_no_src);         //flags come from the Cmp on r7
   dc.Cmp(dc.r3, 0);
   dc.Branch(dc.EQ, l_zero);
   dc.Ldr(dc.r0, CarryMask);
   dc.LdrAdvance(dc.r2, dc.r7, 8);  //source stride
   dc.Branch(dc.LT, l_sub);            //still testing the sign of v

                              //add:
                              //replicate v into all four bytes of r3
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
                              //first word: same steps as the ADDSAT32 macro
   dc.Add(dc.r4, dc.r2, dc.r3);
   dc.Eor(dc.r11, dc.r2, dc.r3);
   dc.And(dc.r2, dc.r3, dc.r2);
   dc.Bic(dc.r11, dc.r11, dc.r4);
   dc.Orr(dc.r11, dc.r11, dc.r2);
   dc.And(dc.r5, dc.r11, dc.r0);
   dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
   dc.Sub(dc.r10, dc.r4, dc.r12);
   dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
   dc.Ldr(dc.r2, dc.r7, -4);           //second source word (r7 already advanced)
   dc.Orr(dc.r11, dc.r11, dc.r10);
   dc.Str(dc.r11, dc.r8, 0);
                              //second word
   dc.Add(dc.r4, dc.r2, dc.r3);
   dc.Eor(dc.r11, dc.r2, dc.r3);
   dc.And(dc.r2, dc.r3, dc.r2);
   dc.Bic(dc.r11, dc.r11, dc.r4);
   dc.Orr(dc.r11, dc.r11, dc.r2);
   dc.And(dc.r5, dc.r11, dc.r0);
   dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
   dc.Sub(dc.r10, dc.r4, dc.r12);
   dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
   dc.Orr(dc.r11, dc.r11, dc.r10);
   dc.Str(dc.r11, dc.r8, 4);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(l_sub);
                              //subtract: negate v, complement the source, add, complement the result
                              //(same steps as the SUBSAT32 macro)
   dc.Rsb(dc.r3, dc.r3, 0);
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
   dc.Mvn(dc.r2, dc.r2);
   dc.Add(dc.r4, dc.r2, dc.r3);
   dc.Eor(dc.r11, dc.r2, dc.r3);
   dc.And(dc.r2, dc.r3, dc.r2);
   dc.Bic(dc.r11, dc.r11, dc.r4);
   dc.Orr(dc.r11, dc.r11, dc.r2);
   dc.And(dc.r5, dc.r11, dc.r0);
   dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
   dc.Sub(dc.r10, dc.r4, dc.r12);
   dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
   dc.Ldr(dc.r2, dc.r7, -4);           //second source word
   dc.Orr(dc.r11, dc.r11, dc.r10);
   dc.Mvn(dc.r11, dc.r11);
   dc.Str(dc.r11, dc.r8, 0);
   dc.Mvn(dc.r2, dc.r2);
   dc.Add(dc.r4, dc.r2, dc.r3);
   dc.Eor(dc.r11, dc.r2, dc.r3);
   dc.And(dc.r2, dc.r3, dc.r2);
   dc.Bic(dc.r11, dc.r11, dc.r4);
   dc.Orr(dc.r11, dc.r11, dc.r2);
   dc.And(dc.r5, dc.r11, dc.r0);
   dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
   dc.Sub(dc.r10, dc.r4, dc.r12);
   dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
   dc.Orr(dc.r11, dc.r11, dc.r10);
   dc.Mvn(dc.r11, dc.r11);
   dc.Str(dc.r11, dc.r8, 4);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(l_zero);
                              //v == 0: plain copy of the 8 source bytes
   dc.Ldr(dc.r2, dc.r7, 4);
   dc.LdrAdvance(dc.r1, dc.r7, 8);  //source stride
   dc.Str(dc.r2, dc.r8, 4);
   dc.Str(dc.r1, dc.r8, 0);
   dc.Mov(dc.pc, dc.lr);

dc.PutLabel(l_no_src);
                              //no source: clamp v to 0..255 and fill the row with it
   dc.Cmp(dc.r3, 0);
   dc.Cond(dc.MI); dc.Mov(dc.r3, 0);
   dc.Cond(dc.PL); dc.Cmp(dc.r3, 255);
   dc.Cond(dc.GT); dc.Mov(dc.r3, 255);
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
   dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
   dc.Str(dc.r3, dc.r8, 0);
   dc.Str(dc.r3, dc.r8, 4);
   dc.Mov(dc.pc, dc.lr);

                              //place the 0x80808080 constant declared above
   dc.PutLabel(CarryMask);

   return fnc_begin;
}
 
//---------------------------- 
 
// BuildIDCT_Block4x8
// Emits the ARM function for the IDCT of a block in which only the first four
// coefficients of each row are used. l_MCol8 and l_RowConst are the entry
// labels returned by BuildMCol8 and BuildRowConst.
//
// Arguments of the emitted function, as read at its start:
//    r0 = Block (16-bit coefficients, 16 bytes per row)
//    r1 = Dst
//    r2 = Dst stride in bytes
//    r3 = Src, or 0 when the result is stored without adding a source
// Flow: the column routine runs on columns 0..3, then a loop handles the rows
// from Block up to Block + 128 (eight rows of 16 bytes). A row with only its
// first coefficient non-zero goes through the RowConst routine; otherwise the
// 4-coefficient row transform is computed, each result is shifted right by 14,
// added to the source byte when Src != 0, clamped to 0..255 and stored as bytes.
//
// NOTE(review): the flags set by "Cmp r7, 0" at the top of the row code are
// consumed only by the later "Branch EQ, Row4_NoSrc"; this relies on none of the
// helpers in between setting flags unless Set() is used - confirm in
// DynamicArmCode.h.
static void BuildIDCT_Block4x8(C_dyn_code &dc, void *l_MCol8, void *l_RowConst){

// r6 Block
// r7 Src
// r8 Dst
   void *W1 = dc.DeclareData(2841);             // 2048*sqrt(2)*cos(1*pi/16)
   void *W2 = dc.DeclareData(2676);                 // 2048*sqrt(2)*cos(2*pi/16)
   void *W3 = dc.DeclareData(2408);                 // 2048*sqrt(2)*cos(3*pi/16)
   void *W6 = dc.DeclareData(1108);                 // 2048*sqrt(2)*cos(6*pi/16)
   void *W7 = dc.DeclareData(565);  //2048*sqrt(2)*cos(7*pi/16)
   void *minus_W5 = dc.DeclareData((dword)-1609);        // -(2048*sqrt(2)*cos(5*pi/16))
   void *Row4_NoSrc = dc.Label(false);
   void *Row4_Sat = dc.Label(false);

                              //layout of the locals kept on the stack below the saved registers
   struct S_stack{
      void *end_ptr;          //Block + 128, loop end marker
      dword dst_pitch;        //copy of the r2 argument
      dword saved_regs[C_dyn_code::STACKFRAME];
   };

   dc.Align(16);

   dc.FunctionBegin(OffsetOf(S_stack, saved_regs));

   dc.Mov(dc.r6, dc.r0);      //Block
   dc.Add(dc.r0, dc.r0, 128);
   dc.Str(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
   dc.Str(dc.r0, dc.sp, OffsetOf(S_stack, end_ptr));

   dc.Mov(dc.r7, dc.r3);         //Src
   dc.Mov(dc.r8, dc.r1);        //Dst

                              //column pass on columns 0..3 (2 bytes apart), then rewind r6
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Sub(dc.r6, dc.r6, 6);
   {
      void *Row4_Loop = dc.Label();

      dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4);    //x3
      dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 6);    //x7
      dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 2);    //x4
      dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 0);    //x0

                              //only x0 non-zero -> constant row
      dc.Orr(dc.r11, dc.r5, dc.r4);
      dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r3);
      void *Row4_NoConst = dc.Label(false);
      dc.Branch(dc.NE, Row4_NoConst);

      dc.BranchLink(l_RowConst);
      void *Row4_Next = dc.Label(false);
      dc.Branch(Row4_Next);

   dc.PutLabel(Row4_NoConst);
                              //flags from this compare are used by Branch(EQ, Row4_NoSrc) below
      dc.Cmp(dc.r7, 0);

      dc.Ldr(dc.r10, W7);
      dc.Ldr(dc.r11, W1);
      dc.Mov(dc.r2, 4);                    //rounding term for the >> 3 below
      dc.Add(dc.r0, dc.r0, 32);
      dc.Mov(dc.r0, dc.r0, dc.LSL, 8);     //x0 = (x0 + 32) << 8
      dc.Mla(dc.r14, dc.r3, dc.r10, dc.r2);   //x5 = x4 * W7 + 4
      dc.Ldr(dc.r10, W3);
      dc.Mla(dc.r3, dc.r11, dc.r3, dc.r2);    //x4 = x4 * W1 + 4
      dc.Mov(dc.r14, dc.r14, dc.ASR, 3);   //x5 >>= 3
      dc.Ldr(dc.r11, minus_W5);
      dc.Mla(dc.r12, dc.r5, dc.r10, dc.r2);   //x6 = x7 * W3 + 4
      dc.Mov(dc.r3, dc.r3, dc.ASR, 3);     //x4 >>= 3
      dc.Ldr(dc.r10, W6);
      dc.Mla(dc.r5, dc.r11, dc.r5, dc.r2);    //x7 = x7 * minus_W5 + 4
      dc.Ldr(dc.r11, W2);
      dc.Add(dc.r9, dc.r3, dc.r12, dc.ASR, 3);   //x1 = x4 + (x6 >> 3)
      dc.Sub(dc.r3, dc.r3, dc.r12, dc.ASR, 3); //x4 = x4 - (x6 >> 3)
      dc.Mla(dc.r12, dc.r4, dc.r10, dc.r2);   //x2 = x3 * W6 + 4
      dc.Mla(dc.r4, dc.r11, dc.r4, dc.r2);    //x3 = x3 * W2 + 4
      dc.Add(dc.r2, dc.r14, dc.r5, dc.ASR, 3);   //x6 = x5 + (x7 >> 3)
      dc.Sub(dc.r5, dc.r14, dc.r5, dc.ASR, 3); //x5 = x5 - (x7 >> 3)
      dc.Add(dc.r14, dc.r0, dc.r4, dc.ASR, 3); //x7 = x0 + (x3 >> 3)
      dc.Sub(dc.r4, dc.r0, dc.r4, dc.ASR, 3); //x8 = x0 - (x3 >> 3)
      dc.Add(dc.r10, dc.r0, dc.r12, dc.ASR, 3);//x3 = x0 + (x2 >> 3)
      dc.Sub(dc.r0, dc.r0, dc.r12, dc.ASR, 3);   //x0 = x0 - (x2 >> 3)
      dc.Add(dc.r1, dc.r5, dc.r3);
      dc.Mov(dc.r11, 181);
      dc.Mul(dc.r12, dc.r1, dc.r11);    //x2 = 181 * (x5 + x4)
      dc.Sub(dc.r3, dc.r3, dc.r5);
      dc.Mul(dc.r1, dc.r3, dc.r11);        //x4 = 181 * (x4 - x5)
      dc.Add(dc.r12, dc.r12, 128);     //x2 += 128
      dc.Add(dc.r3, dc.r1, 128);    //x4 += 128
      dc.Add(dc.r1, dc.r14, dc.r9);        //x5 = x7 + x1
      dc.Sub(dc.r5, dc.r14, dc.r9);        //x1 = x7 - x1
      dc.Add(dc.r11, dc.r10, dc.r12, dc.ASR, 8); //x7 = x3 + (x2 >> 8)
      dc.Sub(dc.r14, dc.r10, dc.r12, dc.ASR, 8); //x2 = x3 - (x2 >> 8)
      dc.Add(dc.r9, dc.r0, dc.r3, dc.ASR, 8); //x3 = x0 + (x4 >> 8)
      dc.Sub(dc.r3, dc.r0, dc.r3, dc.ASR, 8);  //x4 = x0 - (x4 >> 8)
      dc.Add(dc.r12, dc.r4, dc.r2);        //x0 = x8 + x6
      dc.Sub(dc.r4, dc.r4, dc.r2);        //x6 = x8 - x6

                              //results now live in r1,r11,r9,r12,r4,r3,r14,r5 (pixels 0..7)
      dc.Branch(dc.EQ, Row4_NoSrc);

                              //Src present: pixel = Src[i] + (result >> 14)
      dc.Byte(); dc.Ldr(dc.r0, dc.r7, 0);
      dc.Byte(); dc.Ldr(dc.r2, dc.r7, 7);
      dc.Byte(); dc.Ldr(dc.r10, dc.r7, 1);
      dc.Add(dc.r1, dc.r0, dc.r1, dc.ASR, 14);
      dc.Add(dc.r5, dc.r2, dc.r5, dc.ASR, 14);
      dc.Add(dc.r11, dc.r10, dc.r11, dc.ASR, 14);
      dc.Byte(); dc.Ldr(dc.r2, dc.r7, 6);
      dc.Byte(); dc.Ldr(dc.r0, dc.r7, 2);
      dc.Byte(); dc.Ldr(dc.r10, dc.r7, 5);
      dc.Add(dc.r14, dc.r2, dc.r14, dc.ASR, 14);
      dc.Add(dc.r9, dc.r0, dc.r9, dc.ASR, 14);
      dc.Byte(); dc.Ldr(dc.r0, dc.r7, 3);
      dc.Byte(); dc.Ldr(dc.r2, dc.r7, 4);
      dc.Add(dc.r3, dc.r10, dc.r3, dc.ASR, 14);
      dc.Add(dc.r12, dc.r0, dc.r12, dc.ASR, 14);
      dc.Add(dc.r4, dc.r2, dc.r4, dc.ASR, 14);
      dc.Add(dc.r7, dc.r7, 8);        //source stride

   dc.PutLabel(Row4_Sat);
                              //quick test: if no value has bits above the low 8, skip clamping
      dc.Orr(dc.r0, dc.r5, dc.r14);
      dc.Orr(dc.r0, dc.r0, dc.r4);
      dc.Orr(dc.r0, dc.r0, dc.r1);
      dc.Orr(dc.r0, dc.r0, dc.r12);
      dc.Orr(dc.r0, dc.r0, dc.r11);
      dc.Orr(dc.r0, dc.r0, dc.r9);
      dc.Orr(dc.r0, dc.r0, dc.r3);
      dc.Set(); dc.Bic(dc.r0, dc.r0, 255);
      void *Row4_Write = dc.Label(false);
      dc.Branch(dc.EQ, Row4_Write);

      dc.Mov(dc.r0, 0xffffff00);

                              //per value: out of range -> 0xFF, then negative -> 0x00
      dc.Tst(dc.r1, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r1, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r1, 0x00);

      dc.Tst(dc.r11, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r11, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r11, 0x00);

      dc.Tst(dc.r9, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r9, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r9, 0x00);

      dc.Tst(dc.r12, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r12, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r12, 0x00);

      dc.Tst(dc.r4, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r4, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r4, 0x00);

      dc.Tst(dc.r3, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r3, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r3, 0x00);

      dc.Tst(dc.r14, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r14, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r14, 0x00);

      dc.Tst(dc.r5, dc.r0);
      dc.Cond(dc.NE); dc.Mov(dc.r5, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r5, 0x00);

   dc.PutLabel(Row4_Write);
                              //store the eight pixels of the row
      dc.Byte(); dc.Str(dc.r1, dc.r8, 0);
      dc.Byte(); dc.Str(dc.r11, dc.r8, 1);
      dc.Byte(); dc.Str(dc.r9, dc.r8, 2);
      dc.Byte(); dc.Str(dc.r12, dc.r8, 3);
      dc.Byte(); dc.Str(dc.r4, dc.r8, 4);
      dc.Byte(); dc.Str(dc.r3, dc.r8, 5);
      dc.Byte(); dc.Str(dc.r14, dc.r8, 6);
      dc.Byte(); dc.Str(dc.r5, dc.r8, 7);

   dc.PutLabel(Row4_Next);
      dc.Ldr(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
      dc.Ldr(dc.r1, dc.sp, OffsetOf(S_stack, end_ptr));

      dc.Add(dc.r6, dc.r6, 16);      //Block += 16
      dc.Add(dc.r8, dc.r8, dc.r2);    //Dst += DstStride

      dc.Cmp(dc.r6, dc.r1);
      dc.Branch(dc.NE, Row4_Loop);
   }
   dc.FunctionEnd();

                              //out-of-line tail, placed after the function end:
                              //no source -> just scale the results, then rejoin at Row4_Sat
dc.PutLabel(Row4_NoSrc);

   dc.Mov(dc.r5, dc.r5, dc.ASR, 14);
   dc.Mov(dc.r14, dc.r14, dc.ASR, 14);
   dc.Mov(dc.r12, dc.r12, dc.ASR, 14);
   dc.Mov(dc.r1, dc.r1, dc.ASR, 14);
   dc.Mov(dc.r11, dc.r11, dc.ASR, 14);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 14);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 14);
   dc.Mov(dc.r4, dc.r4, dc.ASR, 14);

   dc.Branch(Row4_Sat);

                              //place the constant words declared at the top
   dc.PutLabel(W1);
   dc.PutLabel(W2);
   dc.PutLabel(W3);
   dc.PutLabel(W6);
   dc.PutLabel(W7);
   dc.PutLabel(minus_W5);
}
 
//---------------------------- 
// BuildIDCT_Block8x8
// Emits the ARM function for the full 8x8 IDCT. l_MCol8 and l_RowConst are the
// entry labels returned by BuildMCol8 and BuildRowConst.
//
// Arguments of the emitted function, as read at its start:
//    r0 = Block (16-bit coefficients, 16 bytes per row)
//    r1 = Dst
//    r2 = Dst stride in bytes
//    r3 = Src, or 0 when the result is stored without adding a source
// Registers used across the body:
// r6 Block
// r7 Src
// r8 Dst
// Flow: the column routine runs on all eight columns, then a loop handles the
// rows from Block up to Block + 128. A row with only its first coefficient
// non-zero goes through the RowConst routine; otherwise the 8-point row
// transform is computed, each result is shifted right by 17, added to the
// source byte when Src != 0, clamped to 0..255 and stored as bytes.
//
// NOTE(review): the flags set by "Cmp r7, 0" at the top of the row code are
// consumed only by the later "Branch EQ, Row8_NoSrc"; this relies on none of the
// helpers in between setting flags unless Set() is used - confirm in
// DynamicArmCode.h.
static void BuildIDCT_Block8x8(C_dyn_code &dc, void *l_MCol8, void *l_RowConst){

                              //W1 (2841) and W2 (2676) are not needed as separate words here;
                              //only the combined constants below are used
   void *W3 = dc.DeclareData(2408);                 // 2048*sqrt(2)*cos(3*pi/16)
   void *W6 = dc.DeclareData(1108);                 // 2048*sqrt(2)*cos(6*pi/16)
   void *W7 = dc.DeclareData(565);  //2048*sqrt(2)*cos(7*pi/16)
   void *W1_minus_W7 = dc.DeclareData(2276);
   void *minus_W1_minus_W7 = dc.DeclareData((dword)-3406);
   void *W5_minus_W3 = dc.DeclareData((dword)-799);
   void *minus_W2_minus_W6 = dc.DeclareData((dword)-3784);
   void *Row8_NoSrc = dc.Label(false);
   void *Row8_Sat = dc.Label(false);

                              //layout of the locals kept on the stack below the saved registers
   struct S_stack{
      void *end_ptr;          //Block + 128, loop end marker
      dword dst_pitch;        //copy of the r2 argument
      dword saved_regs[C_dyn_code::STACKFRAME];
   };

   dc.Align(16);

   dc.FunctionBegin(OffsetOf(S_stack, saved_regs));

   dc.Mov(dc.r6, dc.r0);      //Block
   dc.Add(dc. r0, dc.r0, 128);
   dc.Str(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
   dc.Str(dc.r0, dc.sp, OffsetOf(S_stack, end_ptr));
   dc.Mov(dc.r7, dc.r3);         //Src
   dc.Mov(dc.r8, dc.r1);        //Dst

                              //column pass on columns 0..7 (2 bytes apart), then rewind r6
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Add(dc.r6, dc.r6, 2);
   dc.BranchLink(l_MCol8);
   dc.Sub(dc.r6, dc.r6, 14);
   {
      void *Row8_Loop = dc.Label();

      dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 0);        //x0
      dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 2);    //x4
      dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4);    //x3
      dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 6);    //x7
      dc.SHalf(); dc.Ldr(dc.r9, dc.r6, 8);    //x1
      dc.SHalf(); dc.Ldr(dc.r2, dc.r6, 10);      //x6
      dc.SHalf(); dc.Ldr(dc.r14,dc.r6, 12);      //x2
      dc.SHalf(); dc.Ldr(dc.r1, dc.r6, 14);      //x5

                              //only x0 non-zero -> constant row
      dc.Orr(dc.r11, dc.r3, dc.r4);
      dc.Orr(dc.r11, dc.r11, dc.r5);
      dc.Orr(dc.r11, dc.r11, dc.r9);
      dc.Orr(dc.r11, dc.r11, dc.r2);
      dc.Orr(dc.r11, dc.r11, dc.r14);
      dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r1);
      void *Row8_NoConst = dc.Label(false);
      dc.Branch(dc.NE, Row8_NoConst);

      dc.BranchLink(l_RowConst);
      void *Row8_Next = dc.Label(false);
      dc.Branch(Row8_Next);


      /*
      Constant table of the original assembler source, kept for reference
      (the same values are declared with DeclareData at the top of this function):

_W3         DCW 2408                 // 2048*sqrt(2)*cos(3*pi/16)
_W6         DCW 1108                 // 2048*sqrt(2)*cos(6*pi/16)
_W7         DCW 565                  // 2048*sqrt(2)*cos(7*pi/16)

W1_minus_W7       DCW 2276
minus_W1_minus_W7 DCW 0xF2B2 //-3406
W5_minus_W3       DCW 0xFCE1 //-799
minus_W2_minus_W6 DCW 0xF138 //-3784
*/

   dc.PutLabel(Row8_NoConst);
                              //flags from this compare are used by Branch(EQ, Row8_NoSrc) below
      dc.Cmp(dc.r7, 0);

      dc.Add(dc.r0, dc.r0, 32);
      dc.Ldr(dc.r10, W7);
      dc.Mov(dc.r0, dc.r0, dc.LSL, 11);    //x0 = (x0 + 32) << 11
      dc.Ldr(dc.r12, W1_minus_W7);
      dc.Add(dc.r11, dc.r3, dc.r1);
      dc.Mul(dc.r11, dc.r10, dc.r11);       //x8 = W7 * (x4 + x5)
      dc.Ldr(dc.r10, minus_W1_minus_W7);
      dc.Mla(dc.r3, dc.r12, dc.r3, dc.r11);   //x4 = x8 + W1_minus_W7 * x4
      dc.Ldr(dc.r12, W3);
      dc.Mla(dc.r1, dc.r10, dc.r1, dc.r11);   //x5 = x8 + minus_W1_minus_W7 * x5
      dc.Ldr(dc.r10, W5_minus_W3);
      dc.Add(dc.r11, dc.r2, dc.r5);         //x6 + x7
      dc.Mul(dc.r11, dc.r12, dc.r11);       //x8 = W3 * (x6 + x7)
      dc.Mvn(dc.r12, 0xfb0);        //minus_W3_minus_W5 (~0xfb0 = -4017)
      dc.Mla(dc.r2, dc.r10, dc.r2, dc.r11);     //x6 = x8 + W5_minus_W3 * x6
      dc.Ldr(dc.r10, W6);
      dc.Mla(dc.r5, dc.r12, dc.r5, dc.r11);     //x7 = x8 + minus_W3_minus_W5 * x7
      dc.Ldr(dc.r12, minus_W2_minus_W6);
      dc.Add(dc.r11, dc.r0, dc.r9, dc.LSL, 11);//x8 = x0 + (x1 << 11)
      dc.Sub(dc.r0, dc.r0, dc.r9, dc.LSL, 11);  //x0 = x0 - (x1 << 11)
      dc.Add(dc.r9, dc.r4, dc.r14);
      dc.Mul(dc.r9, dc.r10, dc.r9);       //x1 = W6 * (x3 + x2)
      dc.Mov(dc.r10, 0x620);        //W2_minus_W6 (1568)
      dc.Mla(dc.r14, dc.r12, dc.r14, dc.r9); //x2 = x1 + minus_W2_minus_W6 * x2
      dc.Mov(dc.r12, 181);
      dc.Mla(dc.r4, dc.r10, dc.r4, dc.r9);      //x3 = x1 + W2_minus_W6 * x3
      dc.Add(dc.r9, dc.r3, dc.r2);        //x1 = x4 + x6
      dc.Sub(dc.r3, dc.r3, dc.r2);        //x4 = x4 - x6
      dc.Add(dc.r2, dc.r1, dc.r5);        //x6 = x5 + x7
      dc.Sub(dc.r1, dc.r1, dc.r5);        //x5 = x5 - x7
      dc.Add(dc.r5, dc.r11, dc.r4);       //x7 = x8 + x3
      dc.Sub(dc.r11, dc.r11, dc.r4);      //x8 = x8 - x3
      dc.Add(dc.r4, dc.r0, dc.r14);       //x3 = x0 + x2
      dc.Sub(dc.r0, dc.r0, dc.r14);       //x0 = x0 - x2
      dc.Add(dc.r3, dc.r3, 4);        //rounding term for the >> 3 below
      dc.Add(dc.r14, dc.r3, dc.r1);       //x2 = x4 + x5 + 4
      dc.Sub(dc.r3, dc.r3, dc.r1);        //x4 = x4 - x5 + 4
      dc.Mov(dc.r10, 16);
      dc.Mov(dc.r14, dc.r14, dc.ASR, 3);
      dc.Mov(dc.r3, dc.r3, dc.ASR, 3);
      dc.Mla(dc.r14, dc.r12, dc.r14, dc.r10);   //x2 = 181 * ((x4 + x5 + 4) >> 3) + 16
      dc.Mla(dc.r3, dc.r12, dc.r3, dc.r10);  //x4 = 181 * ((x4 - x5 + 4) >> 3) + 16

      dc.Add(dc.r1, dc.r5, dc.r9);        //x5 = x7 + x1
      dc.Sub(dc.r9, dc.r5, dc.r9);        //x1 = x7 - x1
      dc.Add(dc.r5, dc.r4, dc.r14, dc.ASR, 5);  //x7 = x3 + (x2 >> 5)
      dc.Sub(dc.r14, dc.r4, dc.r14, dc.ASR, 5);  //x2 = x3 - (x2 >> 5)
      dc.Add(dc.r4, dc.r0, dc.r3, dc.ASR, 5);   //x3 = x0 + (x4 >> 5)
      dc.Sub(dc.r3, dc.r0, dc.r3, dc.ASR, 5);   //x4 = x0 - (x4 >> 5)
      dc.Add(dc.r0, dc.r11, dc.r2);       //x0 = x8 + x6
      dc.Sub(dc.r2, dc.r11, dc.r2);       //x6 = x8 - x6

                              //results now live in r1,r5,r4,r0,r2,r3,r14,r9 (pixels 0..7)
      dc.Branch(dc.EQ, Row8_NoSrc);

                              //Src present: pixel = Src[i] + (result >> 17)
      dc.Byte(); dc.Ldr(dc.r10, dc.r7, 0);
      dc.Byte(); dc.Ldr(dc.r12, dc.r7, 7);
      dc.Byte(); dc.Ldr(dc.r11, dc.r7, 1);
      dc.Add(dc.r1, dc.r10, dc.r1, dc.ASR, 17);
      dc.Add(dc.r9, dc.r12, dc.r9, dc.ASR, 17);
      dc.Add(dc.r5, dc.r11, dc.r5, dc.ASR, 17);
      dc.Byte(); dc.Ldr(dc.r10, dc.r7, 6);
      dc.Byte(); dc.Ldr(dc.r12, dc.r7, 2);
      dc.Byte(); dc.Ldr(dc.r11, dc.r7, 5);
      dc.Add(dc.r14, dc.r10, dc.r14, dc.ASR, 17);
      dc.Add(dc.r4, dc.r12, dc.r4, dc.ASR, 17);
      dc.Byte(); dc.Ldr(dc.r10, dc.r7, 3);
      dc.Byte(); dc.Ldr(dc.r12, dc.r7, 4);
      dc.Add(dc.r3, dc.r11, dc.r3, dc.ASR, 17);
      dc.Add(dc.r0, dc.r10, dc.r0, dc.ASR, 17);
      dc.Add(dc.r2, dc.r12, dc.r2, dc.ASR, 17);
      dc.Add(dc.r7, dc.r7, 8);        //source stride

   dc.PutLabel(Row8_Sat);
                              //quick test: if no value has bits above the low 8, skip clamping
      dc.Orr(dc.r10, dc.r1, dc.r9);
      dc.Orr(dc.r10, dc.r10, dc.r5);
      dc.Orr(dc.r10, dc.r10, dc.r14);
      dc.Orr(dc.r10, dc.r10, dc.r4);
      dc.Orr(dc.r10, dc.r10, dc.r3);
      dc.Orr(dc.r10, dc.r10, dc.r0);
      dc.Orr(dc.r10, dc.r10, dc.r2);
      dc.Set(); dc.Bic(dc.r10, dc.r10, 0xff);  // 0xFF = 255
      void *Row8_Write = dc.Label(false);
      dc.Branch(dc.EQ, Row8_Write);

      dc.Mov(dc.r10, 0xffffff00);

                              //per value: out of range -> 0xFF, then negative -> 0x00
      dc.Tst(dc.r1, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r1, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r1, 0x00);

      dc.Tst(dc.r9, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r9, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r9, 0x00);

      dc.Tst(dc.r5, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r5, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r5, 0x00);

      dc.Tst(dc.r14, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r14, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r14, 0x00);

      dc.Tst(dc.r4, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r4, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r4, 0x00);

      dc.Tst(dc.r3, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r3, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r3, 0x00);

      dc.Tst(dc.r0, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r0, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r0, 0x00);

      dc.Tst(dc.r2, dc.r10);
      dc.Cond(dc.NE); dc.Mov(dc.r2, 0xFF);
      dc.Cond(dc.MI); dc.Mov(dc.r2, 0x00);

   dc.PutLabel(Row8_Write);
                              //store the eight pixels of the row
      dc.Byte(); dc.Str(dc.r1, dc.r8, 0);
      dc.Byte(); dc.Str(dc.r5, dc.r8, 1);
      dc.Byte(); dc.Str(dc.r4, dc.r8, 2);
      dc.Byte(); dc.Str(dc.r0, dc.r8, 3);
      dc.Byte(); dc.Str(dc.r2, dc.r8, 4);
      dc.Byte(); dc.Str(dc.r3, dc.r8, 5);
      dc.Byte(); dc.Str(dc.r14,dc.r8, 6);
      dc.Byte(); dc.Str(dc.r9, dc.r8, 7);

   dc.PutLabel(Row8_Next);
                              //reload DstStride and BlockEnd from the stack frame
      dc.Ldr(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
      dc.Ldr(dc.r1, dc.sp, OffsetOf(S_stack, end_ptr));

      dc.Add(dc.r6, dc.r6,16);      //Block += 16
      dc.Add(dc.r8, dc.r8, dc.r2);    //Dst += DstStride

      dc.Cmp(dc.r6, dc.r1);
      dc.Branch(dc.NE, Row8_Loop);
   }
   dc.FunctionEnd();

                              //out-of-line tail, placed after the function end:
                              //no source -> just scale the results, then rejoin at Row8_Sat
dc.PutLabel(Row8_NoSrc);

   dc.Mov(dc.r1, dc.r1, dc.ASR, 17);
   dc.Mov(dc.r9, dc.r9, dc.ASR, 17);
   dc.Mov(dc.r5, dc.r5, dc.ASR, 17);
   dc.Mov(dc.r14, dc.r14, dc.ASR, 17);
   dc.Mov(dc.r4, dc.r4, dc.ASR, 17);
   dc.Mov(dc.r3, dc.r3, dc.ASR, 17);
   dc.Mov(dc.r0, dc.r0, dc.ASR, 17);
   dc.Mov(dc.r2, dc.r2, dc.ASR, 17);

   dc.Branch(Row8_Sat);

                              //place the constant words declared at the top
   dc.PutLabel(W3);
   dc.PutLabel(W6);
   dc.PutLabel(W7);
   dc.PutLabel(W1_minus_W7);
   dc.PutLabel(minus_W1_minus_W7);
   dc.PutLabel(W5_minus_W3);
   dc.PutLabel(minus_W2_minus_W6);
}
 
//---------------------------- 
#ifdef USE_IDCT_WMMX 
//---------------------------- 
 
// Emits inline code that copies two 8-byte rows from the source to the destination.
// Registers of the emitted code: r3 = source pointer (advanced by 8 per row),
// r1 = destination pointer (advanced by r2 per row), r2 = destination stride.
// wr1 and wr2 are overwritten.
// NOTE(review): Wldrd/Wstrd are presumably the 64-bit WMMX load/store - confirm
// in DynamicArmCode.h.
static void BuildWMMXconst8x8copyrow(C_dyn_code &dc){

                              //load two consecutive source rows
   dc.Wldrd(dc.wr1, dc.r3, 0);
   dc.Add(dc.r3, dc.r3, 8);
   dc.Wldrd(dc.wr2, dc.r3, 0);
   dc.Add(dc.r3, dc.r3, 8);
                              //store them to two destination rows
   dc.Wstrd(dc.wr1, dc.r1, 0);
   dc.Add(dc.r1, dc.r1, dc.r2);
   dc.Wstrd(dc.wr2, dc.r1, 0);
   dc.Add(dc.r1, dc.r1, dc.r2);
}
 
//---------------------------- 
 
// Appends to 'dc' the instruction sequence that processes two rows with an add:
// two Wldrd loads through r3 (r3 advanced by 8 after each), Waddbus of wr0 onto
// both loaded registers, then two Wstrd stores through r1 (r1 advanced by r2
// after each). The caller is expected to have set up wr0 beforehand.
// NOTE(review): Waddbus is presumably the unsigned-saturating per-byte add of
// the WMMX instruction set - confirm in C_dyn_code.
static void BuildWMMXconst8x8addrow(C_dyn_code &dc){ 
                              //load two consecutive 8-byte groups into wr1 and wr2
   dc.Wldrd(dc.wr1, dc.r3, 0); 
   dc.Add(dc.r3, dc.r3, 8); 
   dc.Wldrd(dc.wr2, dc.r3, 0); 
   dc.Add(dc.r3, dc.r3, 8); 
                              //add wr0 to every loaded group
   dc.Waddbus(dc.wr1, dc.wr1, dc.wr0); 
   dc.Waddbus(dc.wr2, dc.wr2, dc.wr0); 
                              //store the results, stepping r1 by r2 after each store
   dc.Wstrd(dc.wr1, dc.r1, 0); 
   dc.Add(dc.r1, dc.r1, dc.r2); 
   dc.Wstrd(dc.wr2, dc.r1, 0); 
   dc.Add(dc.r1, dc.r1, dc.r2); 
} 
 
//---------------------------- 
 
// Appends to 'dc' the instruction sequence that processes two rows with a subtract:
// two Wldrd loads through r3 (r3 advanced by 8 after each), Wsubbus of wr0 from
// both loaded registers, then two Wstrd stores through r1 (r1 advanced by r2
// after each). The caller is expected to have set up wr0 beforehand.
// NOTE(review): Wsubbus is presumably the unsigned-saturating per-byte subtract
// of the WMMX instruction set - confirm in C_dyn_code.
static void BuildWMMXconst8x8subrow(C_dyn_code &dc){ 
    
                              //load two consecutive 8-byte groups into wr1 and wr2
   dc.Wldrd(dc.wr1, dc.r3, 0); 
   dc.Add(dc.r3, dc.r3, 8); 
   dc.Wldrd(dc.wr2, dc.r3, 0); 
   dc.Add(dc.r3, dc.r3, 8); 
                              //subtract wr0 from every loaded group
   dc.Wsubbus(dc.wr1, dc.wr1, dc.wr0); 
   dc.Wsubbus(dc.wr2, dc.wr2, dc.wr0); 
                              //store the results, stepping r1 by r2 after each store
   dc.Wstrd(dc.wr1, dc.r1, 0); 
   dc.Add(dc.r1, dc.r1, dc.r2); 
   dc.Wstrd(dc.wr2, dc.r1, 0); 
   dc.Add(dc.r1, dc.r1, dc.r2); 
} 
 
//---------------------------- 
 
// Generates the WMMX variant of IDCT_Const8x8.
// The generated code tests r0: positive -> "add" path, negative -> "subtract"
// path, zero -> plain copy (skipped when r1 equals r3).
// Each path is built from four two-row helpers, i.e. eight rows in total.
// NOTE(review): presumably r0 = v, r1 = Dst, r2 = DstPitch, r3 = Src as in the
// C++ IDCT_Const8x8 - confirm against the calling convention.
static void BuildWMMXIDCT_Const8x8(C_dyn_code &dc){

   int pair;
                              //forward labels; created in the same order as before
   void *lbl_add = dc.Label(false);
   void *lbl_sub = dc.Label(false);
   void *lbl_done = dc.Label(false);

   dc.Align(16);
   dc.FunctionBegin();

                              //dispatch on the sign of r0
   dc.Cmp(dc.r0, 0);
   dc.Branch(dc.GT, lbl_add);
   dc.Branch(dc.LT, lbl_sub);
                              //r0 == 0: nothing to do when r1 and r3 are the same
   dc.Cmp(dc.r1, dc.r3);
   dc.Branch(dc.EQ, lbl_done);

   for(pair = 0; pair < 4; pair++)
      BuildWMMXconst8x8copyrow(dc);

dc.PutLabel(lbl_done);
   dc.FunctionEnd();

                              //r0 > 0: broadcast the byte into wr0 and add it to every row
dc.PutLabel(lbl_add);
   dc.Tbcstb(dc.wr0, dc.r0);
   for(pair = 0; pair < 4; pair++)
      BuildWMMXconst8x8addrow(dc);
   dc.FunctionEnd();

                              //r0 < 0: negate, broadcast into wr0 and subtract it from every row
dc.PutLabel(lbl_sub);
   dc.Rsb(dc.r0, dc.r0, 0);
   dc.Tbcstb(dc.wr0, dc.r0);
   for(pair = 0; pair < 4; pair++)
      BuildWMMXconst8x8subrow(dc);
   dc.FunctionEnd();
}
 
//---------------------------- 
#endif 
//---------------------------- 
 
// Builds all dynamically generated IDCT routines into 'dc', in a fixed order:
// the shared MCol8 and RowConst helpers first, then IDCT_Block4x8,
// IDCT_Block8x8 and finally IDCT_Const8x8.
// NOTE(review): the numbers in parentheses below are presumably the slot
// indices of the generated functions - confirm against the caller.
//
// Parameters:
//    dc       - code generator receiving the emitted functions
//    use_wmmx - when true (and USE_IDCT_WMMX is compiled in) the WMMX version
//               of IDCT_Const8x8 is generated; otherwise an empty placeholder
//               is emitted and the C++ IDCT_Const8x8 is expected to be used.
//
// Exactly one function is emitted for the IDCT_Const8x8 slot. Previously the
// empty placeholder was emitted in addition to the WMMX version, which added
// an extra generated function whenever use_wmmx was set.
void BuildIDCTFunctions(C_dyn_code &dc, bool use_wmmx){

   void *l_MCol8 = BuildMCol8(dc);
   void *l_RowConst = BuildRowConst(dc);
                              //IDCT_Block4x8 (13)
   BuildIDCT_Block4x8(dc, l_MCol8, l_RowConst);
                              //IDCT_Block8x8 (14)
   BuildIDCT_Block8x8(dc, l_MCol8, l_RowConst);
                              //IDCT_Const8x8 (15)
#ifdef USE_IDCT_WMMX
   if(use_wmmx){
      BuildWMMXIDCT_Const8x8(dc);
   }else
#endif
   {
                              //implemented in C++, make empty function
      dc.FunctionBegin();
      dc.FunctionEnd();
   }
#ifndef USE_IDCT_WMMX
                              //parameter is meaningful only in WMMX-enabled builds
   (void)use_wmmx;
#endif
}
 
//---------------------------- 
#else //ARM 
//---------------------------- 
// Fixed-point cosine constants used by the portable C++ IDCT below.
// Each Wn is sqrt(2)*cos(n*pi/16) scaled by 2048 and rounded to an integer.
#define W1 2841                 // 2048*sqrt(2)*cos(1*pi/16)  
#define W2 2676                 // 2048*sqrt(2)*cos(2*pi/16)  
#define W3 2408                 // 2048*sqrt(2)*cos(3*pi/16)  
#define W5 1609                 // 2048*sqrt(2)*cos(5*pi/16)  
#define W6 1108                 // 2048*sqrt(2)*cos(6*pi/16)  
#define W7 565                  // 2048*sqrt(2)*cos(7*pi/16)  
 
// Precomputed sums and differences of the constants above, so that each
// rotation in the butterflies needs one multiplication per term.
#define W1_minus_W7 2276        // W1 - W7
#define W1_plus_W7 3406         // W1 + W7
#define W3_minus_W5 799         // W3 - W5
#define W3_plus_W5 4017         // W3 + W5
#define W2_minus_W6 1568        // W2 - W6
#define W2_plus_W6 3784         // W2 + W6
 
// In-place 8-point inverse DCT of one column of an 8x8 coefficient block.
// Blk points at the top element of the column; the eight elements are read
// and written at Blk[0], Blk[8], ... Blk[56].
// Three shortcuts are taken depending on which inputs are zero:
//    - only element 0 non-zero (or everything zero),
//    - only elements 0 and 1 non-zero,
//    - all odd elements (1, 3, 5, 7) zero.
// Otherwise the full butterfly is evaluated.
static void IDCT_Col8(idct_block_t *Blk){

   int i;

   const int in0 = Blk[0];
   const int in1 = Blk[8];
   const int in2 = Blk[16];
   const int in3 = Blk[24];
   const int in4s = Blk[32] << 11;     //element 4 is used pre-scaled
   const int in5 = Blk[40];
   const int in6 = Blk[48];
   const int in7 = Blk[56];

   const int even_ac = in4s | in6 | in2;
   const int odd_hi = in7 | in5 | in3;

   if((even_ac | odd_hi) == 0){
      if(in1 == 0){
                              //element 0 only: every output is the same value
         if(in0 != 0){
            const idct_block_t flat = (idct_block_t)(in0 << 3);
            for(i = 0; i < 64; i += 8)
               Blk[i] = flat;
         }
         return;
      }
                              //elements 0 and 1 only
      {
         const int base = (in0 << 11) + 128;
         const int t7 = W7 * in1;
         const int t1 = W1 * in1;
         const int tp = ((181 * W1_plus_W7 + 128) >> 8) * in1;
         const int tm = ((181 * W1_minus_W7 + 128) >> 8) * in1;

         Blk[0] = (idct_block_t)((base + t1) >> 8);
         Blk[8] = (idct_block_t)((base + tp) >> 8);
         Blk[16] = (idct_block_t)((base + tm) >> 8);
         Blk[24] = (idct_block_t)((base + t7) >> 8);
         Blk[32] = (idct_block_t)((base - t7) >> 8);
         Blk[40] = (idct_block_t)((base - tm) >> 8);
         Blk[48] = (idct_block_t)((base - tp) >> 8);
         Blk[56] = (idct_block_t)((base - t1) >> 8);
      }
      return;
   }

   if((in1 | odd_hi) == 0){
                              //even elements only: the output is mirror-symmetric
      const int base = (in0 << 11) + 128;
      const int sum = base + in4s;
      const int diff = base - in4s;
      const int rot = W6 * (in2 + in6);
      const int e2 = rot - (W2_plus_W6) * in6;
      const int e3 = rot + (W2_minus_W6) * in2;

      const idct_block_t o0 = (idct_block_t)((sum + e3) >> 8);
      const idct_block_t o1 = (idct_block_t)((diff + e2) >> 8);
      const idct_block_t o2 = (idct_block_t)((diff - e2) >> 8);
      const idct_block_t o3 = (idct_block_t)((sum - e3) >> 8);

      Blk[0] = o0;
      Blk[8] = o1;
      Blk[16] = o2;
      Blk[24] = o3;
      Blk[32] = o3;
      Blk[40] = o2;
      Blk[48] = o1;
      Blk[56] = o0;
      return;
   }

                              //general case: full butterfly
   {
      const int base = (in0 << 11) + 128;

                              //odd part, first stage
      const int rot17 = W7 * (in1 + in7);
      const int a4 = rot17 + (W1_minus_W7) * in1;
      const int a5 = rot17 - (W1_plus_W7) * in7;
      const int rot35 = W3 * (in5 + in3);
      const int a6 = rot35 - (W3_minus_W5) * in5;
      const int a7 = rot35 - (W3_plus_W5) * in3;

                              //even part
      const int sum = base + in4s;
      const int diff = base - in4s;
      const int rot26 = W6 * (in2 + in6);
      const int e2 = rot26 - (W2_plus_W6) * in6;
      const int e3 = rot26 + (W2_minus_W6) * in2;

                              //odd part, second stage
      const int b1 = a4 + a6;
      const int b4 = a4 - a6;
      const int b6 = a5 + a7;
      const int b5 = a5 - a7;

      const int s0 = sum + e3;
      const int s3 = sum - e3;
      const int s1 = diff + e2;
      const int s2 = diff - e2;

                              //181/256 approximates 1/sqrt(2)
      const int c2 = (181 * (b4 + b5) + 128) >> 8;
      const int c4 = (181 * (b4 - b5) + 128) >> 8;

      Blk[0] = (idct_block_t)((s0 + b1) >> 8);
      Blk[8] = (idct_block_t)((s1 + c2) >> 8);
      Blk[16] = (idct_block_t)((s2 + c4) >> 8);
      Blk[24] = (idct_block_t)((s3 + b6) >> 8);
      Blk[32] = (idct_block_t)((s3 - b6) >> 8);
      Blk[40] = (idct_block_t)((s2 - c4) >> 8);
      Blk[48] = (idct_block_t)((s1 - c2) >> 8);
      Blk[56] = (idct_block_t)((s0 - b1) >> 8);
   }
}
 
static void IDCT_RowConst(int v, byte *Dst, const byte *Src){ 
   if (Src) { 
       
      dword MaskCarry = 0x80808080U; 
      dword a,b,c,d; 
 
      a = ((dword*)Src)[0]; 
      d = ((dword*)Src)[1]; 
      if(v>0){ 
         v |= v << 8; 
         v |= v << 16; 
         ADDSAT32(a, ((dword*)Dst)[0], v); 
         ADDSAT32(d, ((dword*)Dst)[1], v); 
      }else 
      if(v<0){ 
         v = -v; 
         v |= v << 8; 
         v |= v << 16; 
 
         SUBSAT32(a,((dword*)Dst)[0],v); 
         SUBSAT32(d,((dword*)Dst)[1],v); 
      }else{ 
         ((dword*)Dst)[0] = a; 
         ((dword*)Dst)[1] = d; 
      } 
   }else{ 
      SAT(v); 
      v &= 255; 
      v |= v << 8; 
      v |= v << 16; 
 
      ((dword*)Dst)[1] = ((dword*)Dst)[0] = v; 
   } 
}    
 
//---------------------------- 
 
// 8-point inverse DCT of one row, producing 8 output pixels.
//    Blk - the 8 coefficients of the row (after the column pass)
//    Dst - receives the 8 resulting bytes
//    Src - optional 8-byte reference row that is added to the result, or NULL
// When all coefficients except Blk[0] are zero the work is delegated to
// IDCT_RowConst. Results are clamped with SAT only if at least one of them
// has bits set above the low 8.
static void IDCT_Row8(idct_block_t *Blk, byte *Dst, const byte *Src){

   int i;
   int p[8];                  //output pixels, in destination order

   const int in1 = Blk[1];
   const int in2 = Blk[2];
   const int in3 = Blk[3];
   const int in4 = Blk[4];
   const int in5 = Blk[5];
   const int in6 = Blk[6];
   const int in7 = Blk[7];

   if((in4 | in6 | in2 | in1 | in7 | in5 | in3) == 0){
      IDCT_RowConst((Blk[0] + 32) >> 6, Dst, Src);
      return;
   }

   {
      const int mid = in4 << 8;
      const int base = (Blk[0] << 8) + 8192;

                              //odd part, first stage
      const int rot17 = W7 * (in1 + in7) + 4;
      const int a4 = (rot17 + (W1_minus_W7) * in1) >> 3;
      const int a5 = (rot17 - (W1_plus_W7) * in7) >> 3;
      const int rot35 = W3 * (in5 + in3) + 4;
      const int a6 = (rot35 - (W3_minus_W5) * in5) >> 3;
      const int a7 = (rot35 - (W3_plus_W5) * in3) >> 3;

                              //even part
      const int sum = base + mid;
      const int diff = base - mid;
      const int rot26 = W6 * (in2 + in6) + 4;
      const int e2 = (rot26 - (W2_plus_W6) * in6) >> 3;
      const int e3 = (rot26 + (W2_minus_W6) * in2) >> 3;

                              //odd part, second stage
      const int b1 = a4 + a6;
      const int b4 = a4 - a6;
      const int b6 = a5 + a7;
      const int b5 = a5 - a7;

      const int s0 = sum + e3;
      const int s3 = sum - e3;
      const int s1 = diff + e2;
      const int s2 = diff - e2;

                              //181/256 approximates 1/sqrt(2)
      const int c2 = (181 * (b4 + b5) + 128) >> 8;
      const int c4 = (181 * (b4 - b5) + 128) >> 8;

      p[0] = (s0 + b1) >> 14;
      p[7] = (s0 - b1) >> 14;
      p[1] = (s1 + c2) >> 14;
      p[6] = (s1 - c2) >> 14;
      p[2] = (s2 + c4) >> 14;
      p[5] = (s2 - c4) >> 14;
      p[3] = (s3 + b6) >> 14;
      p[4] = (s3 - b6) >> 14;
   }

                              //add the reference row, if any
   if(Src){
      for(i = 0; i < 8; i++)
         p[i] += Src[i];
   }

                              //clamp only when some value left the 0..255 range
   {
      int spill = 0;
      for(i = 0; i < 8; i++)
         spill |= p[i];
      if(spill >> 8){
         for(i = 0; i < 8; i++){
            SAT(p[i])
         }
      }
   }

   for(i = 0; i < 8; i++)
      Dst[i] = (byte)p[i];
}
 
//---------------------------- 
 
// Reduced 8-point inverse DCT of one row that reads only the first four
// coefficients (Blk[0..3]); it still produces 8 output pixels.
//    Blk - coefficients of the row (after the column pass)
//    Dst - receives the 8 resulting bytes
//    Src - optional 8-byte reference row that is added to the result, or NULL
// When Blk[1..3] are all zero the work is delegated to IDCT_RowConst.
// Results are clamped with SAT only if at least one of them has bits set
// above the low 8.
static void IDCT_Row4(idct_block_t *Blk, byte *Dst, const byte *Src){

   int i;
   int p[8];                  //output pixels, in destination order

   const int in1 = Blk[1];
   const int in2 = Blk[2];
   const int in3 = Blk[3];

   if((in2 | in1 | in3) == 0){
      IDCT_RowConst((Blk[0] + 32) >> 6, Dst, Src);
      return;
   }

   {
      const int base = (Blk[0] << 8) + 8192;

                              //odd part, first stage
      const int a5 = (W7 * in1 + 4) >> 3;
      const int a4 = (W1 * in1 + 4) >> 3;
      const int a6 = (W3 * in3 + 4) >> 3;
      const int a7 = (-W5 * in3 + 4) >> 3;

                              //even part
      const int e2 = (W6 * in2 + 4) >> 3;
      const int e3 = (W2 * in2 + 4) >> 3;

                              //odd part, second stage
      const int b1 = a4 + a6;
      const int b4 = a4 - a6;
      const int b6 = a5 + a7;
      const int b5 = a5 - a7;

      const int s0 = base + e3;
      const int s3 = base - e3;
      const int s1 = base + e2;
      const int s2 = base - e2;

                              //181/256 approximates 1/sqrt(2)
      const int c2 = (181 * (b4 + b5) + 128) >> 8;
      const int c4 = (181 * (b4 - b5) + 128) >> 8;

      p[0] = (s0 + b1) >> 14;
      p[7] = (s0 - b1) >> 14;
      p[1] = (s1 + c2) >> 14;
      p[6] = (s1 - c2) >> 14;
      p[2] = (s2 + c4) >> 14;
      p[5] = (s2 - c4) >> 14;
      p[3] = (s3 + b6) >> 14;
      p[4] = (s3 - b6) >> 14;
   }

                              //add the reference row, if any
   if(Src){
      for(i = 0; i < 8; i++)
         p[i] += Src[i];
   }

                              //clamp only when some value left the 0..255 range
   {
      int spill = 0;
      for(i = 0; i < 8; i++)
         spill |= p[i];
      if(spill >> 8){
         for(i = 0; i < 8; i++){
            SAT(p[i])
         }
      }
   }

   for(i = 0; i < 8; i++)
      Dst[i] = (byte)p[i];
}
 
//---------------------------- 
 
// Full 8x8 inverse DCT.
//    Block      - 64 coefficients, row-major; overwritten by the column pass
//    Dest       - top-left byte of the 8x8 output area
//    DestStride - distance in bytes between output rows
//    Src        - optional reference block laid out with 8 bytes per row that
//                 is added to the result, or NULL
// All eight columns are transformed in place first, then each row is
// transformed and written to Dest.
void IDCT_Block8x8(idct_block_t *Block, byte *Dest, int DestStride, const byte *Src){

   int i;
                              //a missing reference block must not be advanced
   const int SrcStride = Src ? 8 : 0;

   for(i = 0; i < 8; i++)
      IDCT_Col8(Block + i);

                              //first row uses the pointers as passed in; the rest step before each call
   IDCT_Row8(Block, Dest, Src);
   for(i = 8; i < 64; i += 8){
      Dest += DestStride;
      Src += SrcStride;
      IDCT_Row8(Block + i, Dest, Src);
   }
}
 
//---------------------------- 
 
// Inverse DCT for a block where only the four leftmost coefficient columns
// are processed.
//    Block      - 64 coefficients, row-major; columns 0..3 are overwritten
//                 by the column pass
//    Dest       - top-left byte of the 8x8 output area
//    DestStride - distance in bytes between output rows
//    Src        - optional reference block laid out with 8 bytes per row that
//                 is added to the result, or NULL
// Columns 0..3 are transformed in place, then every row goes through the
// reduced row transform IDCT_Row4.
void IDCT_Block4x8(idct_block_t *Block, byte *Dest, int DestStride, const byte *Src){

   int i;
                              //a missing reference block must not be advanced
   const int SrcStride = Src ? 8 : 0;

   for(i = 0; i < 4; i++)
      IDCT_Col8(Block + i);

                              //first row uses the pointers as passed in; the rest step before each call
   IDCT_Row4(Block, Dest, Src);
   for(i = 8; i < 64; i += 8){
      Dest += DestStride;
      Src += SrcStride;
      IDCT_Row4(Block + i, Dest, Src);
   }
}
 
#endif   //!ARM 
 
//---------------------------- 
 
// Applies a constant IDCT result to a reference block:
// Dst = Src + v for the whole 8x8 area, with per-byte saturation.
//    v        - constant to add (may be negative or zero)
//    Dst      - top-left byte of the output area, written as two dwords per row
//    DstPitch - distance in bytes between output rows
//    Src      - reference block, 8 bytes per row, read as two dwords per row
// v > 0 adds through ADDSAT32, v < 0 subtracts |v| through SUBSAT32.
// v == 0 copies Src to Dst unchanged. This matches what IDCT_RowConst and the
// generated WMMX version of this function do; previously this case wrote
// nothing, leaving Dst stale whenever it was a different buffer than Src.
// The copy is skipped when Src is NULL or when Dst and Src are the same.
void IDCT_Const8x8(int v, byte * Dst, int DstPitch, const byte *Src){

   int SrcPitch = 8;

   const byte* SrcEnd = Src + 8*SrcPitch;
                              //names a, b, c, d, MaskCarry are used by the ADDSAT32/SUBSAT32 macros
   dword MaskCarry = 0x80808080U;
   dword a,b,c,d;

   if(v>0){
                              //replicate v into all four bytes of a dword
      v |= v << 8;
      v |= v << 16;
      do{
         a = ((dword*)Src)[0];
         d = ((dword*)Src)[1];
         ADDSAT32(a,((dword*)Dst)[0],v);
         ADDSAT32(d,((dword*)Dst)[1],v);
         Dst += DstPitch;
         Src += SrcPitch;
      }while (Src != SrcEnd);
   }else
   if(v<0){
                              //replicate |v| into all four bytes of a dword
      v = -v;
      v |= v << 8;
      v |= v << 16;
      do{
         a = ((dword*)Src)[0];
         d = ((dword*)Src)[1];
         SUBSAT32(a,((dword*)Dst)[0],v);
         SUBSAT32(d,((dword*)Dst)[1],v);
         Dst += DstPitch;
         Src += SrcPitch;
      }while (Src != SrcEnd);
   }else
   if(Src && Dst != Src){
                              //zero constant: the result is the reference block itself
      do{
         a = ((dword*)Src)[0];
         d = ((dword*)Src)[1];
         ((dword*)Dst)[0] = a;
         ((dword*)Dst)[1] = d;
         Dst += DstPitch;
         Src += SrcPitch;
      }while (Src != SrcEnd);
   }
}
 
//----------------------------