// www.pudn.com > coremp4-1.0.zip > idct.cpp
/*****************************************************************************
* This program is free software ; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*****************************************************************************
*
* Authors:
*
* Andrea Graziani (Ag): Original source code (Open Divx Decoder 0.4a).
* Pedro Mateu (Pm) and
* Gabor Kovacs (Kg) Heavily modified and optimized code
* Michal Bacik Porting into C++
*
****************************************************************************/
#include "Rules.h"
#include "Util.h"
//----------------------------
// Element type of the coefficient blocks handled by the IDCT routines in this file.
typedef short idct_block_t;
// Windows CE and Palm OS builds additionally compile the WMMX code generator
// for IDCT_Const8x8 (see the USE_IDCT_WMMX sections further down).
#if defined _WIN32_WCE || defined __PALMOS__
#define USE_IDCT_WMMX
#endif
//----------------------------
// 2D Inverse Discrete Cosine Transform (iDCT)
// SAT(Value): clamp the integer lvalue 'Value' to the pixel range 0..255, in place.
// One definition is used on every target. The former non-ARM bit trick shifted by a
// negative count for negative inputs (undefined behaviour) and produced 0 instead of
// 255 for inputs such as 512, whose bit 8 is clear.
// The expansion deliberately ends with ';' because existing call sites are written
// without a trailing semicolon.
#define SAT(Value) (Value) = (Value) < 0 ? 0 : ((Value) > 255 ? 255 : (Value));
// Packed saturating arithmetic on four unsigned bytes held in one 32-bit word.
// Both macros expect the enclosing scope to provide 32-bit unsigned temporaries 'b'
// and 'c' and the constant 'MaskCarry' (0x80808080). The variable passed as 'a' is
// used as scratch and is destroyed.
//
// ADDSAT32: Dst = per-byte min(255, a + Add32).
// The low 7 bits of every byte are added separately from the top bits, so no carry
// can leak into the neighbouring byte. (The previous version did one full 32-bit add
// and subtracted the leaked carries afterwards, which left the next byte one too low
// when a byte summed to exactly 255 and also received a carry from below.)
#define ADDSAT32(a,Dst,Add32) \
b = ((a) & ~MaskCarry) + ((Add32) & ~MaskCarry); /* 7-bit sums; bit 7 = carry into the byte's top bit */ \
c = ((a) ^ (Add32)) & MaskCarry; /* top-bit sum without carry-in */ \
a = (((a) & (Add32)) | (((a) ^ (Add32)) & b)) & MaskCarry; /* bytes that overflowed */ \
b ^= c; /* wrapped per-byte sums */ \
c = (a) << 1; \
b |= c - ((a) >> 7); /* 0xFF over every overflowed byte */ \
Dst = b;
// SUBSAT32: Dst = per-byte max(0, a - Add32), computed as ~ADDSAT(~a, Add32).
#define SUBSAT32(a,Dst,Add32) \
a = ~(a); \
b = ((a) & ~MaskCarry) + ((Add32) & ~MaskCarry); /* 7-bit sums; bit 7 = carry into the byte's top bit */ \
c = ((a) ^ (Add32)) & MaskCarry; /* top-bit sum without carry-in */ \
a = (((a) & (Add32)) | (((a) ^ (Add32)) & b)) & MaskCarry; /* bytes that overflowed */ \
b ^= c; /* wrapped per-byte sums */ \
c = (a) << 1; \
b |= c - ((a) >> 7); /* 0xFF over every overflowed byte */ \
Dst = ~b;
//----------------------------
#ifdef ARM
#include "DynamicArmCode.h"
//----------------------------
// Emits the column-pass subroutine shared by the block IDCT routines and returns
// the label of its entry point. The emitted code transforms, in place, the eight
// 16-bit coefficients of one column.
// Register contract of the emitted code:
//  r6 = address of the column's first coefficient (rows are 'pitch' = 16 bytes apart)
//  r7, r8 are not referenced here, so callers can keep live values in them
//  r0-r5 and r9-r12 are overwritten
//  returns with "mov pc, lr"
static void *BuildMCol8(C_dyn_code &dc){
const dword pitch = 16;
// Register assignment after the loads below (the xN names match the C version, IDCT_Col8):
// r10 = x0
// r4 = x1
// r2 = x2
// r1 = x3
// r3 = x4
// r12 = x5
// r0 = x6
// r5 = x7
// r11 = x8
// r9 = tmp (x567)
// Labels for the three non-trivial paths; they are bound further down by PutLabel.
// NOTE(review): Label(false) presumably creates a not-yet-placed label - confirm in C_dyn_code.
void *mode_1 = dc.Label(false);
void *mode_2 = dc.Label(false);
void *mode_3 = dc.Label(false);
dc.Align(16);
void *fnc_begin = dc.Label();
// Load the eight coefficients of the column (SHalf() presumably selects a signed halfword load).
dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4*pitch);
dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 5*pitch);
dc.SHalf(); dc.Ldr(dc.r12,dc.r6, 7*pitch);
dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 3*pitch);
dc.SHalf(); dc.Ldr(dc.r2, dc.r6, 6*pitch);
dc.SHalf(); dc.Ldr(dc.r1, dc.r6, 2*pitch);
dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 1*pitch);
dc.SHalf(); dc.Ldr(dc.r10, dc.r6, 0*pitch);
// r9 = x5|x6|x7, r11 = r9|x2|x1|x3 (the last Orr sets the flags)
dc.Orr(dc.r9, dc.r12, dc.r0);
dc.Orr(dc.r9, dc.r9, dc.r5);
dc.Orr(dc.r11, dc.r9, dc.r2);
dc.Orr(dc.r11, dc.r11, dc.r4);
dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r1);
// anything besides x0 and x4 present -> mode_2 / mode_3
dc.Branch(dc.NE, mode_2);
// x4 present -> mode_1
dc.Cmp(dc.r3, 0);
dc.Branch(dc.NE, mode_1);
// only x0 can be non-zero; an all-zero column is left untouched
dc.Cmp(dc.r10, 0);
dc.Cond(dc.EQ); dc.Mov(dc.pc, dc.lr);
// DC only: every row of the column becomes x0 << 3
dc.Mov(dc.r10, dc.r10, dc.LSL, 3);
dc.Half(); dc.Str(dc.r10, dc.r6, 0);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x10);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x20);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x30);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x40);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x50);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x60);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x70);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(mode_1);
//x0,x4
// Constants are assembled from two immediates: 564|1 = 565, 2832|9 = 2841,
// 2400|8 = 2408, 1600|9 = 1609.
dc.Mov(dc.r11, dc.r3);
dc.Mov(dc.r2, 564);
dc.Orr(dc.r2, dc.r2, 1);
dc.Mov(dc.r9, dc.r3);
dc.Mul(dc.r2, dc.r11, dc.r2); //r2 = 565 * x4
dc.Mov(dc.r11, 2832);
dc.Orr(dc.r11, dc.r11, 9);
dc.Mul(dc.r4, dc.r9, dc.r11); //r4 = 2841 * x4
dc.Mov(dc.r11, 2400);
dc.Orr(dc.r11, dc.r11, 8);
dc.Mul(dc.r5, dc.r9, dc.r11); //r5 = 2408 * x4
dc.Mov(dc.r11, 1600);
dc.Mov(dc.r1, dc.r10, dc.LSL, 11);
dc.Orr(dc.r11, dc.r11, 9);
dc.Mul(dc.r0, dc.r3, dc.r11); //r0 = 1609 * x4
dc.Add(dc.r1, dc.r1, 128); //r1 = (x0 << 11) + 128
// rows 0..3 = (r1 + r4, r5, r0, r2) >> 8; rows 4..7 = (r1 - r2, r0, r5, r4) >> 8
dc.Add(dc.r3, dc.r4, dc.r1);
dc.Add(dc.r11, dc.r5, dc.r1);
dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
dc.Half(); dc.Str(dc.r3, dc.r6, 0);
dc.Half(); dc.Str(dc.r11, dc.r6, 0x10);
dc.Add(dc.r3, dc.r0, dc.r1);
dc.Add(dc.r11, dc.r2, dc.r1);
dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
dc.Half(); dc.Str(dc.r3, dc.r6, 0x20);
dc.Half(); dc.Str(dc.r11, dc.r6, 0x30);
dc.Sub(dc.r3, dc.r1, dc.r2);
dc.Sub(dc.r11, dc.r1, dc.r0);
dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
dc.Half(); dc.Str(dc.r3, dc.r6, 0x40);
dc.Half(); dc.Str(dc.r11, dc.r6, 0x50);
dc.Sub(dc.r3, dc.r1, dc.r5);
dc.Sub(dc.r11, dc.r1, dc.r4);
dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
dc.Mov(dc.r11, dc.r11, dc.ASR, 8);
dc.Half(); dc.Str(dc.r3, dc.r6, 0x60);
dc.Half(); dc.Str(dc.r11, dc.r6, 0x70);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(mode_2);
//x0,x1,x2,x3
// r9 still holds x5|x6|x7; if that or x4 is non-zero the full transform is needed
dc.Set(); dc.Orr(dc.r11, dc.r9, dc.r3);
dc.Branch(dc.NE, mode_3);
dc.Mov(dc.r3, dc.r10, dc.LSL, 11);
dc.Add(dc.r3, dc.r3, 128); //r3 = (x0 << 11) + 128
dc.Mov(dc.r9, 1104);
dc.Add(dc.r5, dc.r3, dc.r4, dc.LSL, 11); //r5 = x0 + (x1 << 11)
dc.Add(dc.r11, dc.r2, dc.r1);
dc.Orr(dc.r9, dc.r9, 4); //1104|4 = 1108
dc.Sub(dc.r3, dc.r3, dc.r4, dc.LSL, 11); //r3 = x0 - (x1 << 11)
dc.Mul(dc.r4, dc.r11, dc.r9); //r4 = 1108 * (x2 + x3)
dc.Mov(dc.r11, 3776);
dc.Orr(dc.r11, dc.r11, 8); //3776|8 = 3784
dc.Mul(dc.r11, dc.r2, dc.r11);
dc.Sub(dc.r2, dc.r4, dc.r11); //r2 = r4 - 3784 * x2
dc.Mov(dc.r11, 1568);
dc.Mul(dc.r11, dc.r1, dc.r11);
dc.Add(dc.r0, dc.r2, dc.r3);
dc.Add(dc.r1, dc.r11, dc.r4); //r1 = r4 + 1568 * x3
dc.Add(dc.r4, dc.r5, dc.r1);
dc.Sub(dc.r3, dc.r3, dc.r2);
dc.Sub(dc.r5, dc.r5, dc.r1);
dc.Mov(dc.r1, dc.r4, dc.ASR, 8);
dc.Mov(dc.r3, dc.r3, dc.ASR, 8);
dc.Mov(dc.r2, dc.r0, dc.ASR, 8);
dc.Mov(dc.r4, dc.r5, dc.ASR, 8);
// the output is mirrored: rows 7..4 repeat rows 0..3
dc.Half(); dc.Str(dc.r1, dc.r6, 0x00);
dc.Half(); dc.Str(dc.r2, dc.r6, 0x10);
dc.Half(); dc.Str(dc.r3, dc.r6, 0x20);
dc.Half(); dc.Str(dc.r4, dc.r6, 0x30);
dc.Half(); dc.Str(dc.r4, dc.r6, 0x40);
dc.Half(); dc.Str(dc.r3, dc.r6, 0x50);
dc.Half(); dc.Str(dc.r2, dc.r6, 0x60);
dc.Half(); dc.Str(dc.r1, dc.r6, 0x70);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(mode_3);
//x0,x1,x2,x3,x4,x5,x6,x7
// Negative constants are built as Mvn(imm) [^ imm]:
// ~0xd40 ^ 0xd = -3406, ~0x310 ^ 0xe = -799, ~0xfb0 = -4017, ~0xec0 ^ 7 = -3784.
dc.Mov(dc.r9, 564);
dc.Orr(dc.r9, dc.r9, 1); //W7
dc.Add(dc.r11, dc.r12, dc.r3);
dc.Mul(dc.r11, dc.r9, dc.r11); //x8 = W7 * (x5 + x4)
dc.Mov(dc.r9, 2272);
dc.Orr(dc.r9, dc.r9, 4); //W1_minus_W7
dc.Mla(dc.r3, dc.r9, dc.r3, dc.r11); //x4 = x8 + (W1_minus_W7) * x4
dc.Mvn(dc.r9, 0xd40);
dc.Eor(dc.r9, dc.r9, 0xd); //minus_W1_minus_W7
dc.Mla(dc.r12, dc.r9, dc.r12, dc.r11); //x5 = x8 + (minus_W1_minus_W7) * x5
dc.Mov(dc.r9, 2400); //
dc.Orr(dc.r9, dc.r9, 8); //W3
dc.Add(dc.r11, dc.r0, dc.r5);
dc.Mul(dc.r11, dc.r9, dc.r11); //x8 = W3 * (x6 + x7)
dc.Mvn(dc.r9, 0x310);
dc.Eor(dc.r9, dc.r9, 0xe); //W5_minus_W3
dc.Mla(dc.r0, dc.r9, dc.r0, dc.r11); //x6 = x8 + (W5_minus_W3) * x6
dc.Mvn(dc.r9, 0xfb0); //minus_W3_minus_W5
dc.Mla(dc.r5, dc.r9, dc.r5, dc.r11); //x7 = x8 + minus_W3_minus_W5 * x7
dc.Mov(dc.r10, dc.r10, dc.LSL, 11);
dc.Add(dc.r10, dc.r10, 128); //x0 = (x0 << 11) + 128
dc.Add(dc.r11, dc.r10, dc.r4, dc.LSL, 11); //x8 = x0 + (x1 << 11)
dc.Sub(dc.r10, dc.r10, dc.r4, dc.LSL, 11); //x0 = x0 - (x1 << 11)
dc.Mov(dc.r9, 1104);
dc.Orr(dc.r9, dc.r9, 4); //W6
dc.Add(dc.r4, dc.r1, dc.r2);
dc.Mul(dc.r4, dc.r9, dc.r4); //x1 = W6 * (x3 + x2)
dc.Mvn(dc.r9, 0xec0);
dc.Eor(dc.r9, dc.r9, 7); //minus_W2_minus_W6
dc.Mla(dc.r2, dc.r9, dc.r2, dc.r4); //x2 = x1 + minus_W2_minus_W6 * x2
dc.Mov(dc.r9, 0x620); //W2_minus_W6
dc.Mla(dc.r1, dc.r9, dc.r1, dc.r4); //x3 = x1 + (W2_minus_W6) * x3
dc.Add(dc.r4, dc.r3, dc.r0); //x1 = x4 + x6
dc.Sub(dc.r3, dc.r3, dc.r0); //x4 -= x6
dc.Add(dc.r0, dc.r12, dc.r5); //x6 = x5 + x7
dc.Sub(dc.r12, dc.r12, dc.r5); //x5 -= x7
dc.Add(dc.r5, dc.r11, dc.r1); //x7 = x8 + x3
dc.Sub(dc.r11, dc.r11, dc.r1); //x8 -= x3
dc.Add(dc.r1, dc.r10, dc.r2); //x3 = x0 + x2
dc.Sub(dc.r10, dc.r10, dc.r2); //x0 -= x2
dc.Add(dc.r9, dc.r3, dc.r12); //x4 + x5
dc.Sub(dc.r3, dc.r3, dc.r12); //x4 - x5
dc.Mov(dc.r12, 181);
dc.Mul(dc.r2, dc.r9, dc.r12); //181 * (x4 + x5)
dc.Mul(dc.r9, dc.r3, dc.r12); //181 * (x4 - x5)
dc.Add(dc.r2, dc.r2, 128); //x2 = 181 * (x4 + x5) + 128
dc.Add(dc.r3, dc.r9, 128); //x4 = 181 * (x4 - x5) + 128
// x2 and x4 are used below shifted right by 8 (the ASR operand of Add/Sub)
dc.Add(dc.r9, dc.r5, dc.r4);
dc.Sub(dc.r5, dc.r5, dc.r4);
dc.Mov(dc.r9, dc.r9, dc.ASR, 8); //(x7 + x1) >> 8
dc.Mov(dc.r5, dc.r5, dc.ASR, 8); //(x7 - x1) >> 8
dc.Half(); dc.Str(dc.r9, dc.r6, 0x00);
dc.Half(); dc.Str(dc.r5, dc.r6, 0x70);
dc.Add(dc.r9, dc.r1, dc.r2, dc.ASR, 8);
dc.Sub(dc.r1, dc.r1, dc.r2, dc.ASR, 8);
dc.Mov(dc.r9, dc.r9, dc.ASR, 8); //(x3 + x2) >> 8
dc.Mov(dc.r1, dc.r1, dc.ASR, 8); //(x3 - x2) >> 8
dc.Half(); dc.Str(dc.r9, dc.r6, 0x10);
dc.Half(); dc.Str(dc.r1, dc.r6, 0x60);
dc.Add(dc.r9, dc.r10, dc.r3, dc.ASR, 8);
dc.Sub(dc.r10, dc.r10, dc.r3, dc.ASR, 8);
dc.Mov(dc.r9, dc.r9, dc.ASR, 8); //(x0 + x4) >> 8
dc.Mov(dc.r10, dc.r10, dc.ASR, 8); //(x0 - x4) >> 8
dc.Half(); dc.Str(dc.r9, dc.r6, 0x20);
dc.Half(); dc.Str(dc.r10, dc.r6, 0x50);
dc.Add(dc. r9, dc.r11, dc.r0);
dc.Sub(dc.r11, dc.r11, dc.r0);
dc.Mov(dc.r9, dc.r9, dc.ASR, 8); //(x8 + x6) >> 8
dc.Mov(dc.r11, dc.r11, dc.ASR, 8); //(x8 - x6) >> 8
dc.Half(); dc.Str(dc.r9, dc.r6, 0x30);
dc.Half(); dc.Str(dc.r11, dc.r6, 0x40);
dc.Mov(dc.pc, dc.lr);
return fnc_begin;
}
//----------------------------
// Emits the subroutine that outputs one 8-pixel row when only the first coefficient
// of the row is non-zero, and returns the label of its entry point.
// The constant is v = (Block[0] + 32) >> 6.
//  Src != 0: Dst[0..7] = Src[0..7] + v with per-byte saturation; r7 is advanced by 8
//  Src == 0: Dst[0..7] = v clamped to 0..255
// Register contract of the emitted code:
//  in: r0 = Block[0], r7 = Src (may be 0), r8 = Dst
//  overwritten: r0-r5, r10-r12; r6 and r8 are not modified
//  returns with "mov pc, lr"
static void *BuildRowConst(C_dyn_code &dc){
// r0 Block[0]
// r6 Block
// r7 Src
// r8 Dst
void *l_no_src = dc.Label(false);
void *l_zero = dc.Label(false);
void *l_sub = dc.Label(false);
// 0x80808080 literal, loaded PC-relative below and placed by PutLabel(CarryMask) at the end
void *CarryMask = dc.DeclareData(0x80808080);
dc.Align(16);
void *fnc_begin = dc.Label();
dc.Add(dc.r0, dc.r0, 32);
dc.Cmp(dc.r7, 0);
dc.Mov(dc.r3, dc.r0, dc.ASR, 6); //r3 = v = (Block[0] + 32) >> 6
dc.Branch(dc.EQ, l_no_src);
dc.Cmp(dc.r3, 0);
dc.Branch(dc.EQ, l_zero);
dc.Ldr(dc.r0, CarryMask);
// r2 = first four source pixels; r7 moves to the next source row
// (the Ldr(r7, -4) below then reads the second four pixels of this row)
dc.LdrAdvance(dc.r2, dc.r7, 8); //source stride
// flags are still those of "cmp r3, 0": negative v -> subtract path
dc.Branch(dc.LT, l_sub);
//add:
// Replicate v into all four bytes of r3, then add it to the four pixels in r2 with
// per-byte saturation: r5 = carry-out bits of each byte (masked with r0 = 0x80808080),
// r12 = carries that leaked into the next byte (removed from the sum),
// r12 - (r5 >> 7) = 0xFF over every overflowed byte.
// NOTE(review): the replication assumes v <= 255 - confirm callers guarantee it.
// NOTE(review): if a byte sums to exactly 255 and also receives a carry from the byte
// below, the byte above it appears to end up one too low - verify.
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
dc.Add(dc.r4, dc.r2, dc.r3);
dc.Eor(dc.r11, dc.r2, dc.r3);
dc.And(dc.r2, dc.r3, dc.r2);
dc.Bic(dc.r11, dc.r11, dc.r4);
dc.Orr(dc.r11, dc.r11, dc.r2);
dc.And(dc.r5, dc.r11, dc.r0);
dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
dc.Sub(dc.r10, dc.r4, dc.r12);
dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
dc.Ldr(dc.r2, dc.r7, -4); //second four source pixels
dc.Orr(dc.r11, dc.r11, dc.r10);
dc.Str(dc.r11, dc.r8, 0);
// same sequence for pixels 4..7
dc.Add(dc.r4, dc.r2, dc.r3);
dc.Eor(dc.r11, dc.r2, dc.r3);
dc.And(dc.r2, dc.r3, dc.r2);
dc.Bic(dc.r11, dc.r11, dc.r4);
dc.Orr(dc.r11, dc.r11, dc.r2);
dc.And(dc.r5, dc.r11, dc.r0);
dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
dc.Sub(dc.r10, dc.r4, dc.r12);
dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
dc.Orr(dc.r11, dc.r11, dc.r10);
dc.Str(dc.r11, dc.r8, 4);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(l_sub);
// Subtract path: r3 = -v replicated; the pixels are complemented, the saturating add
// is applied, and the result is complemented again.
dc.Rsb(dc.r3, dc.r3, 0);
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
dc.Mvn(dc.r2, dc.r2);
dc.Add(dc.r4, dc.r2, dc.r3);
dc.Eor(dc.r11, dc.r2, dc.r3);
dc.And(dc.r2, dc.r3, dc.r2);
dc.Bic(dc.r11, dc.r11, dc.r4);
dc.Orr(dc.r11, dc.r11, dc.r2);
dc.And(dc.r5, dc.r11, dc.r0);
dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
dc.Sub(dc.r10, dc.r4, dc.r12);
dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
dc.Ldr(dc.r2, dc.r7, -4); //second four source pixels
dc.Orr(dc.r11, dc.r11, dc.r10);
dc.Mvn(dc.r11, dc.r11);
dc.Str(dc.r11, dc.r8, 0);
dc.Mvn(dc.r2, dc.r2);
dc.Add(dc.r4, dc.r2, dc.r3);
dc.Eor(dc.r11, dc.r2, dc.r3);
dc.And(dc.r2, dc.r3, dc.r2);
dc.Bic(dc.r11, dc.r11, dc.r4);
dc.Orr(dc.r11, dc.r11, dc.r2);
dc.And(dc.r5, dc.r11, dc.r0);
dc.Mov(dc.r12, dc.r5, dc.LSL, 1);
dc.Sub(dc.r10, dc.r4, dc.r12);
dc.Sub(dc.r11, dc.r12, dc.r5, dc.LSR, 7);
dc.Orr(dc.r11, dc.r11, dc.r10);
dc.Mvn(dc.r11, dc.r11);
dc.Str(dc.r11, dc.r8, 4);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(l_zero);
// v == 0 with a source: plain 8-byte copy, r7 advanced to the next source row
dc.Ldr(dc.r2, dc.r7, 4);
dc.LdrAdvance(dc.r1, dc.r7, 8); //source stride
dc.Str(dc.r2, dc.r8, 4);
dc.Str(dc.r1, dc.r8, 0);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(l_no_src);
// no source: clamp v to 0..255 and fill the row with it
dc.Cmp(dc.r3, 0);
dc.Cond(dc.MI); dc.Mov(dc.r3, 0);
dc.Cond(dc.PL); dc.Cmp(dc.r3, 255);
dc.Cond(dc.GT); dc.Mov(dc.r3, 255);
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 8);
dc.Orr(dc.r3, dc.r3, dc.r3, dc.LSL, 16);
dc.Str(dc.r3, dc.r8, 0);
dc.Str(dc.r3, dc.r8, 4);
dc.Mov(dc.pc, dc.lr);
dc.PutLabel(CarryMask);
return fnc_begin;
}
//----------------------------
// Emits IDCT_Block4x8: the inverse DCT of a block whose rows use only their first
// four coefficients. The column routine is run on columns 0..3, then each of the
// eight rows is transformed, optionally added to a source row, clamped to 0..255
// and written out as 8 bytes.
//  l_MCol8    = entry label of the column routine (BuildMCol8)
//  l_RowConst = entry label of the constant-row routine (BuildRowConst)
// Arguments of the emitted function, as consumed by the prologue below:
//  r0 = Block, r1 = Dst, r2 = Dst pitch in bytes, r3 = Src (0 = no source;
//  a non-zero Src is read 8 bytes per row)
static void BuildIDCT_Block4x8(C_dyn_code &dc, void *l_MCol8, void *l_RowConst){
// r6 Block
// r7 Src
// r8 Dst
// Literal constants, loaded PC-relative and placed by the PutLabel calls at the end.
void *W1 = dc.DeclareData(2841); // 2048*sqrt(2)*cos(1*pi/16)
void *W2 = dc.DeclareData(2676); // 2048*sqrt(2)*cos(2*pi/16)
void *W3 = dc.DeclareData(2408); // 2048*sqrt(2)*cos(3*pi/16)
void *W6 = dc.DeclareData(1108); // 2048*sqrt(2)*cos(6*pi/16)
void *W7 = dc.DeclareData(565); //2048*sqrt(2)*cos(7*pi/16)
void *minus_W5 = dc.DeclareData((dword)-1609); // -(2048*sqrt(2)*cos(5*pi/16))
void *Row4_NoSrc = dc.Label(false);
void *Row4_Sat = dc.Label(false);
// Layout of the stack frame of the emitted function; the offset of saved_regs is
// passed to FunctionBegin (presumably the size of the local area - confirm in C_dyn_code).
struct S_stack{
void *end_ptr;
dword dst_pitch;
dword saved_regs[C_dyn_code::STACKFRAME];
};
dc.Align(16);
dc.FunctionBegin(OffsetOf(S_stack, saved_regs));
dc.Mov(dc.r6, dc.r0); //Block
dc.Add(dc.r0, dc.r0, 128); //end of block = Block + 128 bytes (8 rows of 16 bytes)
dc.Str(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
dc.Str(dc.r0, dc.sp, OffsetOf(S_stack, end_ptr));
dc.Mov(dc.r7, dc.r3); //Src
dc.Mov(dc.r8, dc.r1); //Dst
// column pass over columns 0..3, then r6 back to the start of the block
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Sub(dc.r6, dc.r6, 6);
{
// row pass: one iteration per row, r6 = current row
void *Row4_Loop = dc.Label();
dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4); //x3
dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 6); //x7
dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 2); //x4
dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 0); //x0
dc.Orr(dc.r11, dc.r5, dc.r4);
dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r3);
void *Row4_NoConst = dc.Label(false);
dc.Branch(dc.NE, Row4_NoConst);
// only x0 present: let the constant-row routine produce this row
dc.BranchLink(l_RowConst);
void *Row4_Next = dc.Label(false);
dc.Branch(Row4_Next);
dc.PutLabel(Row4_NoConst);
// The flags of this compare are consumed by Branch(EQ, Row4_NoSrc) after the
// arithmetic below; no flag-setting instruction is emitted in between.
dc.Cmp(dc.r7, 0);
dc.Ldr(dc.r10, W7);
dc.Ldr(dc.r11, W1);
dc.Mov(dc.r2, 4); //rounding term for the >> 3 below
dc.Add(dc.r0, dc.r0, 32);
dc.Mov(dc.r0, dc.r0, dc.LSL, 8); //x0
dc.Mla(dc.r14, dc.r3, dc.r10, dc.r2); //x5 = x4 * W7 + 4
dc.Ldr(dc.r10, W3);
dc.Mla(dc.r3, dc.r11, dc.r3, dc.r2); //x4 = x4 * W1 + 4
dc.Mov(dc.r14, dc.r14, dc.ASR, 3); //x5 >>= 3
dc.Ldr(dc.r11, minus_W5);
dc.Mla(dc.r12, dc.r5, dc.r10, dc.r2); //x6 = x7 * W3 + 4
dc.Mov(dc.r3, dc.r3, dc.ASR, 3); //x4 >>= 3
dc.Ldr(dc.r10, W6);
dc.Mla(dc.r5, dc.r11, dc.r5, dc.r2); //x7 = x7 * minus_W5 + 4
dc.Ldr(dc.r11, W2);
dc.Add(dc.r9, dc.r3, dc.r12, dc.ASR, 3); //x1 = x4 + (x6 >> 3)
dc.Sub(dc.r3, dc.r3, dc.r12, dc.ASR, 3); //x4 = x4 - (x6 >> 3)
dc.Mla(dc.r12, dc.r4, dc.r10, dc.r2); //x2 = x3 * W6 + 4
dc.Mla(dc.r4, dc.r11, dc.r4, dc.r2); //x3 = x3 * W2 + 4
dc.Add(dc.r2, dc.r14, dc.r5, dc.ASR, 3); //x6 = x5 + (x7 >> 3)
dc.Sub(dc.r5, dc.r14, dc.r5, dc.ASR, 3); //x5 = x5 - (x7 >> 3)
dc.Add(dc.r14, dc.r0, dc.r4, dc.ASR, 3); //x7 = x0 + (x3 >> 3)
dc.Sub(dc.r4, dc.r0, dc.r4, dc.ASR, 3); //x8 = x0 - (x3 >> 3)
dc.Add(dc.r10, dc.r0, dc.r12, dc.ASR, 3);//x3 = x0 + (x2 >> 3)
dc.Sub(dc.r0, dc.r0, dc.r12, dc.ASR, 3); //x0 = x0 - (x2 >> 3)
dc.Add(dc.r1, dc.r5, dc.r3);
dc.Mov(dc.r11, 181);
dc.Mul(dc.r12, dc.r1, dc.r11); //x2 = 181 * (x5 + x4)
dc.Sub(dc.r3, dc.r3, dc.r5);
dc.Mul(dc.r1, dc.r3, dc.r11); //x4 = 181 * (x4 - x5)
dc.Add(dc.r12, dc.r12, 128); //x2 += 128
dc.Add(dc.r3, dc.r1, 128); //x4 += 128
dc.Add(dc.r1, dc.r14, dc.r9); //x5 = x7 + x1
dc.Sub(dc.r5, dc.r14, dc.r9); //x1 = x7 - x1
dc.Add(dc.r11, dc.r10, dc.r12, dc.ASR, 8); //x7 = x3 + (x2 >> 8)
dc.Sub(dc.r14, dc.r10, dc.r12, dc.ASR, 8); //x2 = x3 - (x2 >> 8)
dc.Add(dc.r9, dc.r0, dc.r3, dc.ASR, 8); //x3 = x0 + (x4 >> 8)
dc.Sub(dc.r3, dc.r0, dc.r3, dc.ASR, 8); //x4 = x0 - (x4 >> 8)
dc.Add(dc.r12, dc.r4, dc.r2); //x0 = x8 + x6
dc.Sub(dc.r4, dc.r4, dc.r2); //x6 = x8 - x6
// The eight results, still scaled by 1 << 14, are now in output order
// r1, r11, r9, r12, r4, r3, r14, r5.
dc.Branch(dc.EQ, Row4_NoSrc);
// with a source: output[i] = Src[i] + (result[i] >> 14)
dc.Byte(); dc.Ldr(dc.r0, dc.r7, 0);
dc.Byte(); dc.Ldr(dc.r2, dc.r7, 7);
dc.Byte(); dc.Ldr(dc.r10, dc.r7, 1);
dc.Add(dc.r1, dc.r0, dc.r1, dc.ASR, 14);
dc.Add(dc.r5, dc.r2, dc.r5, dc.ASR, 14);
dc.Add(dc.r11, dc.r10, dc.r11, dc.ASR, 14);
dc.Byte(); dc.Ldr(dc.r2, dc.r7, 6);
dc.Byte(); dc.Ldr(dc.r0, dc.r7, 2);
dc.Byte(); dc.Ldr(dc.r10, dc.r7, 5);
dc.Add(dc.r14, dc.r2, dc.r14, dc.ASR, 14);
dc.Add(dc.r9, dc.r0, dc.r9, dc.ASR, 14);
dc.Byte(); dc.Ldr(dc.r0, dc.r7, 3);
dc.Byte(); dc.Ldr(dc.r2, dc.r7, 4);
dc.Add(dc.r3, dc.r10, dc.r3, dc.ASR, 14);
dc.Add(dc.r12, dc.r0, dc.r12, dc.ASR, 14);
dc.Add(dc.r4, dc.r2, dc.r4, dc.ASR, 14);
dc.Add(dc.r7, dc.r7, 8); //source stride
dc.PutLabel(Row4_Sat);
// fast check: if no value has bits outside 0..255, skip the clamping
dc.Orr(dc.r0, dc.r5, dc.r14);
dc.Orr(dc.r0, dc.r0, dc.r4);
dc.Orr(dc.r0, dc.r0, dc.r1);
dc.Orr(dc.r0, dc.r0, dc.r12);
dc.Orr(dc.r0, dc.r0, dc.r11);
dc.Orr(dc.r0, dc.r0, dc.r9);
dc.Orr(dc.r0, dc.r0, dc.r3);
dc.Set(); dc.Bic(dc.r0, dc.r0, 255);
void *Row4_Write = dc.Label(false);
dc.Branch(dc.EQ, Row4_Write);
// per value: any bit above bit 7 set -> 255, then negative -> 0
dc.Mov(dc.r0, 0xffffff00);
dc.Tst(dc.r1, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r1, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r1, 0x00);
dc.Tst(dc.r11, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r11, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r11, 0x00);
dc.Tst(dc.r9, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r9, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r9, 0x00);
dc.Tst(dc.r12, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r12, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r12, 0x00);
dc.Tst(dc.r4, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r4, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r4, 0x00);
dc.Tst(dc.r3, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r3, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r3, 0x00);
dc.Tst(dc.r14, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r14, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r14, 0x00);
dc.Tst(dc.r5, dc.r0);
dc.Cond(dc.NE); dc.Mov(dc.r5, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r5, 0x00);
dc.PutLabel(Row4_Write);
dc.Byte(); dc.Str(dc.r1, dc.r8, 0);
dc.Byte(); dc.Str(dc.r11, dc.r8, 1);
dc.Byte(); dc.Str(dc.r9, dc.r8, 2);
dc.Byte(); dc.Str(dc.r12, dc.r8, 3);
dc.Byte(); dc.Str(dc.r4, dc.r8, 4);
dc.Byte(); dc.Str(dc.r3, dc.r8, 5);
dc.Byte(); dc.Str(dc.r14, dc.r8, 6);
dc.Byte(); dc.Str(dc.r5, dc.r8, 7);
dc.PutLabel(Row4_Next);
// advance to the next row and loop until r6 reaches the end of the block
dc.Ldr(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
dc.Ldr(dc.r1, dc.sp, OffsetOf(S_stack, end_ptr));
dc.Add(dc.r6, dc.r6, 16); //Block += 16
dc.Add(dc.r8, dc.r8, dc.r2); //Dst += DstStride
dc.Cmp(dc.r6, dc.r1);
dc.Branch(dc.NE, Row4_Loop);
}
dc.FunctionEnd();
// Out-of-line path for rows without a source: scale the results down and rejoin
// the clamping code.
dc.PutLabel(Row4_NoSrc);
dc.Mov(dc.r5, dc.r5, dc.ASR, 14);
dc.Mov(dc.r14, dc.r14, dc.ASR, 14);
dc.Mov(dc.r12, dc.r12, dc.ASR, 14);
dc.Mov(dc.r1, dc.r1, dc.ASR, 14);
dc.Mov(dc.r11, dc.r11, dc.ASR, 14);
dc.Mov(dc.r9, dc.r9, dc.ASR, 14);
dc.Mov(dc.r3, dc.r3, dc.ASR, 14);
dc.Mov(dc.r4, dc.r4, dc.ASR, 14);
dc.Branch(Row4_Sat);
// place the literal constants declared at the top
dc.PutLabel(W1);
dc.PutLabel(W2);
dc.PutLabel(W3);
dc.PutLabel(W6);
dc.PutLabel(W7);
dc.PutLabel(minus_W5);
}
//----------------------------
// Emits IDCT_Block8x8: the full 8x8 inverse DCT. The column routine is run on all
// eight columns, then each of the eight rows is transformed, optionally added to a
// source row, clamped to 0..255 and written out as 8 bytes.
//  l_MCol8    = entry label of the column routine (BuildMCol8)
//  l_RowConst = entry label of the constant-row routine (BuildRowConst)
// Arguments of the emitted function, as consumed by the prologue below:
//  r0 = Block, r1 = Dst, r2 = Dst pitch in bytes, r3 = Src (0 = no source;
//  a non-zero Src is read 8 bytes per row)
// Registers kept across the row loop:
// r6 Block
// r7 Src
// r8 Dst
static void BuildIDCT_Block8x8(C_dyn_code &dc, void *l_MCol8, void *l_RowConst){
// Literal constants, loaded PC-relative and placed by the PutLabel calls at the end.
// W1 and W2 are not needed on their own here; only the combined terms are used.
void *W3 = dc.DeclareData(2408); // 2048*sqrt(2)*cos(3*pi/16)
void *W6 = dc.DeclareData(1108); // 2048*sqrt(2)*cos(6*pi/16)
void *W7 = dc.DeclareData(565); //2048*sqrt(2)*cos(7*pi/16)
void *W1_minus_W7 = dc.DeclareData(2276);
void *minus_W1_minus_W7 = dc.DeclareData((dword)-3406);
void *W5_minus_W3 = dc.DeclareData((dword)-799);
void *minus_W2_minus_W6 = dc.DeclareData((dword)-3784);
void *Row8_NoSrc = dc.Label(false);
void *Row8_Sat = dc.Label(false);
// Layout of the stack frame of the emitted function; the offset of saved_regs is
// passed to FunctionBegin (presumably the size of the local area - confirm in C_dyn_code).
struct S_stack{
void *end_ptr;
dword dst_pitch;
dword saved_regs[C_dyn_code::STACKFRAME];
};
dc.Align(16);
dc.FunctionBegin(OffsetOf(S_stack, saved_regs));
dc.Mov(dc.r6, dc.r0); //Block
dc.Add(dc. r0, dc.r0, 128); //end of block = Block + 128 bytes (8 rows of 16 bytes)
dc.Str(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
dc.Str(dc.r0, dc.sp, OffsetOf(S_stack, end_ptr));
dc.Mov(dc.r7, dc.r3); //Src
dc.Mov(dc.r8, dc.r1); //Dst
// column pass over columns 0..7, then r6 back to the start of the block
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Add(dc.r6, dc.r6, 2);
dc.BranchLink(l_MCol8);
dc.Sub(dc.r6, dc.r6, 14);
{
// row pass: one iteration per row, r6 = current row
void *Row8_Loop = dc.Label();
dc.SHalf(); dc.Ldr(dc.r0, dc.r6, 0); //x0
dc.SHalf(); dc.Ldr(dc.r3, dc.r6, 2); //x4
dc.SHalf(); dc.Ldr(dc.r4, dc.r6, 4); //x3
dc.SHalf(); dc.Ldr(dc.r5, dc.r6, 6); //x7
dc.SHalf(); dc.Ldr(dc.r9, dc.r6, 8); //x1
dc.SHalf(); dc.Ldr(dc.r2, dc.r6, 10); //x6
dc.SHalf(); dc.Ldr(dc.r14,dc.r6, 12); //x2
dc.SHalf(); dc.Ldr(dc.r1, dc.r6, 14); //x5
dc.Orr(dc.r11, dc.r3, dc.r4);
dc.Orr(dc.r11, dc.r11, dc.r5);
dc.Orr(dc.r11, dc.r11, dc.r9);
dc.Orr(dc.r11, dc.r11, dc.r2);
dc.Orr(dc.r11, dc.r11, dc.r14);
dc.Set(); dc.Orr(dc.r11, dc.r11, dc.r1);
void *Row8_NoConst = dc.Label(false);
dc.Branch(dc.NE, Row8_NoConst);
// only x0 present: let the constant-row routine produce this row
dc.BranchLink(l_RowConst);
void *Row8_Next = dc.Label(false);
dc.Branch(Row8_Next);
dc.PutLabel(Row8_NoConst);
// The flags of this compare are consumed by Branch(EQ, Row8_NoSrc) after the
// arithmetic below; no flag-setting instruction is emitted in between.
dc.Cmp(dc.r7, 0);
dc.Add(dc.r0, dc.r0, 32);
dc.Ldr(dc.r10, W7);
dc.Mov(dc.r0, dc.r0, dc.LSL, 11); //x0 = (x0 + 32) << 11
dc.Ldr(dc.r12, W1_minus_W7);
dc.Add(dc.r11, dc.r3, dc.r1);
dc.Mul(dc.r11, dc.r10, dc.r11); //x8 = W7 * (x4 + x5)
dc.Ldr(dc.r10, minus_W1_minus_W7);
dc.Mla(dc.r3, dc.r12, dc.r3, dc.r11); //x4 = x8 + W1_minus_W7 * x4
dc.Ldr(dc.r12, W3);
dc.Mla(dc.r1, dc.r10, dc.r1, dc.r11); //x5 = x8 + minus_W1_minus_W7 * x5
dc.Ldr(dc.r10, W5_minus_W3);
dc.Add(dc.r11, dc.r2, dc.r5); //x6 + x7
dc.Mul(dc.r11, dc.r12, dc.r11); //x8 = W3 * (x6 + x7)
dc.Mvn(dc.r12, 0xfb0); //minus_W3_minus_W5 (~0xfb0 = -4017)
dc.Mla(dc.r2, dc.r10, dc.r2, dc.r11); //x6 = x8 + W5_minus_W3 * x6
dc.Ldr(dc.r10, W6);
dc.Mla(dc.r5, dc.r12, dc.r5, dc.r11); //x7 = x8 + minus_W3_minus_W5 * x7
dc.Ldr(dc.r12, minus_W2_minus_W6);
dc.Add(dc.r11, dc.r0, dc.r9, dc.LSL, 11);//x8 = x0 + (x1 << 11)
dc.Sub(dc.r0, dc.r0, dc.r9, dc.LSL, 11); //x0 = x0 - (x1 << 11)
dc.Add(dc.r9, dc.r4, dc.r14);
dc.Mul(dc.r9, dc.r10, dc.r9); //x1 = W6 * (x3 + x2)
dc.Mov(dc.r10, 0x620); //W2_minus_W6
dc.Mla(dc.r14, dc.r12, dc.r14, dc.r9); //x2 = x1 + minus_W2_minus_W6 * x2
dc.Mov(dc.r12, 181);
dc.Mla(dc.r4, dc.r10, dc.r4, dc.r9); //x3 = x1 + W2_minus_W6 * x3
dc.Add(dc.r9, dc.r3, dc.r2); //x1 = x4 + x6
dc.Sub(dc.r3, dc.r3, dc.r2); //x4 = x4 - x6
dc.Add(dc.r2, dc.r1, dc.r5); //x6 = x5 + x7
dc.Sub(dc.r1, dc.r1, dc.r5); //x5 = x5 - x7
dc.Add(dc.r5, dc.r11, dc.r4); //x7 = x8 + x3
dc.Sub(dc.r11, dc.r11, dc.r4); //x8 = x8 - x3
dc.Add(dc.r4, dc.r0, dc.r14); //x3 = x0 + x2
dc.Sub(dc.r0, dc.r0, dc.r14); //x0 = x0 - x2
dc.Add(dc.r3, dc.r3, 4); //
dc.Add(dc.r14, dc.r3, dc.r1); //x2 = x4 + x5 + 4
dc.Sub(dc.r3, dc.r3, dc.r1); //x4 = x4 - x5 + 4
dc.Mov(dc.r10, 16);
dc.Mov(dc.r14, dc.r14, dc.ASR, 3);
dc.Mov(dc.r3, dc.r3, dc.ASR, 3);
dc.Mla(dc.r14, dc.r12, dc.r14, dc.r10); //x2 = 181 * ((x4 + x5 + 4) >> 3) + 16
dc.Mla(dc.r3, dc.r12, dc.r3, dc.r10); //x4 = 181 * ((x4 - x5 + 4) >> 3) + 16
dc.Add(dc.r1, dc.r5, dc.r9); //x5 = x7 + x1
dc.Sub(dc.r9, dc.r5, dc.r9); //x1 = x7 - x1
dc.Add(dc.r5, dc.r4, dc.r14, dc.ASR, 5); //x7 = x3 + (x2 >> 5)
dc.Sub(dc.r14, dc.r4, dc.r14, dc.ASR, 5); //x2 = x3 - (x2 >> 5)
dc.Add(dc.r4, dc.r0, dc.r3, dc.ASR, 5); //x3 = x0 + (x4 >> 5)
dc.Sub(dc.r3, dc.r0, dc.r3, dc.ASR, 5); //x4 = x0 - (x4 >> 5)
dc.Add(dc.r0, dc.r11, dc.r2); //x0 = x8 + x6
dc.Sub(dc.r2, dc.r11, dc.r2); //x6 = x8 - x6
// The eight results, still scaled by 1 << 17, are now in output order
// r1, r5, r4, r0, r2, r3, r14, r9.
dc.Branch(dc.EQ, Row8_NoSrc);
// with a source: output[i] = Src[i] + (result[i] >> 17)
dc.Byte(); dc.Ldr(dc.r10, dc.r7, 0);
dc.Byte(); dc.Ldr(dc.r12, dc.r7, 7);
dc.Byte(); dc.Ldr(dc.r11, dc.r7, 1);
dc.Add(dc.r1, dc.r10, dc.r1, dc.ASR, 17);
dc.Add(dc.r9, dc.r12, dc.r9, dc.ASR, 17);
dc.Add(dc.r5, dc.r11, dc.r5, dc.ASR, 17);
dc.Byte(); dc.Ldr(dc.r10, dc.r7, 6);
dc.Byte(); dc.Ldr(dc.r12, dc.r7, 2);
dc.Byte(); dc.Ldr(dc.r11, dc.r7, 5);
dc.Add(dc.r14, dc.r10, dc.r14, dc.ASR, 17);
dc.Add(dc.r4, dc.r12, dc.r4, dc.ASR, 17);
dc.Byte(); dc.Ldr(dc.r10, dc.r7, 3);
dc.Byte(); dc.Ldr(dc.r12, dc.r7, 4);
dc.Add(dc.r3, dc.r11, dc.r3, dc.ASR, 17);
dc.Add(dc.r0, dc.r10, dc.r0, dc.ASR, 17);
dc.Add(dc.r2, dc.r12, dc.r2, dc.ASR, 17);
dc.Add(dc.r7, dc.r7, 8); //source stride
dc.PutLabel(Row8_Sat);
// fast check: if no value has bits outside 0..255, skip the clamping
dc.Orr(dc.r10, dc.r1, dc.r9);
dc.Orr(dc.r10, dc.r10, dc.r5);
dc.Orr(dc.r10, dc.r10, dc.r14);
dc.Orr(dc.r10, dc.r10, dc.r4);
dc.Orr(dc.r10, dc.r10, dc.r3);
dc.Orr(dc.r10, dc.r10, dc.r0);
dc.Orr(dc.r10, dc.r10, dc.r2);
dc.Set(); dc.Bic(dc.r10, dc.r10, 0xff); // 0xFF = 255
void *Row8_Write = dc.Label(false);
dc.Branch(dc.EQ, Row8_Write);
// per value: any bit above bit 7 set -> 255, then negative -> 0
dc.Mov(dc.r10, 0xffffff00);
dc.Tst(dc.r1, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r1, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r1, 0x00);
dc.Tst(dc.r9, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r9, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r9, 0x00);
dc.Tst(dc.r5, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r5, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r5, 0x00);
dc.Tst(dc.r14, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r14, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r14, 0x00);
dc.Tst(dc.r4, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r4, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r4, 0x00);
dc.Tst(dc.r3, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r3, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r3, 0x00);
dc.Tst(dc.r0, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r0, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r0, 0x00);
dc.Tst(dc.r2, dc.r10);
dc.Cond(dc.NE); dc.Mov(dc.r2, 0xFF);
dc.Cond(dc.MI); dc.Mov(dc.r2, 0x00);
dc.PutLabel(Row8_Write);
dc.Byte(); dc.Str(dc.r1, dc.r8, 0);
dc.Byte(); dc.Str(dc.r5, dc.r8, 1);
dc.Byte(); dc.Str(dc.r4, dc.r8, 2);
dc.Byte(); dc.Str(dc.r0, dc.r8, 3);
dc.Byte(); dc.Str(dc.r2, dc.r8, 4);
dc.Byte(); dc.Str(dc.r3, dc.r8, 5);
dc.Byte(); dc.Str(dc.r14,dc.r8, 6);
dc.Byte(); dc.Str(dc.r9, dc.r8, 7);
dc.PutLabel(Row8_Next);
// advance to the next row and loop until r6 reaches the end of the block
dc.Ldr(dc.r2, dc.sp, OffsetOf(S_stack, dst_pitch));
dc.Ldr(dc.r1, dc.sp, OffsetOf(S_stack, end_ptr));
dc.Add(dc.r6, dc.r6,16); //Block += 16
dc.Add(dc.r8, dc.r8, dc.r2); //Dst += DstStride
dc.Cmp(dc.r6, dc.r1);
dc.Branch(dc.NE, Row8_Loop);
}
dc.FunctionEnd();
// Out-of-line path for rows without a source: scale the results down and rejoin
// the clamping code.
dc.PutLabel(Row8_NoSrc);
dc.Mov(dc.r1, dc.r1, dc.ASR, 17);
dc.Mov(dc.r9, dc.r9, dc.ASR, 17);
dc.Mov(dc.r5, dc.r5, dc.ASR, 17);
dc.Mov(dc.r14, dc.r14, dc.ASR, 17);
dc.Mov(dc.r4, dc.r4, dc.ASR, 17);
dc.Mov(dc.r3, dc.r3, dc.ASR, 17);
dc.Mov(dc.r0, dc.r0, dc.ASR, 17);
dc.Mov(dc.r2, dc.r2, dc.ASR, 17);
dc.Branch(Row8_Sat);
// place the literal constants declared at the top
dc.PutLabel(W3);
dc.PutLabel(W6);
dc.PutLabel(W7);
dc.PutLabel(W1_minus_W7);
dc.PutLabel(minus_W1_minus_W7);
dc.PutLabel(W5_minus_W3);
dc.PutLabel(minus_W2_minus_W6);
}
//----------------------------
#ifdef USE_IDCT_WMMX
//----------------------------
// Emits code that copies two rows: two loads into wr1/wr2 from [r3], r3 advancing
// by 8 after each, then two stores to [r1], r1 advancing by r2 after each.
// NOTE(review): Wldrd/Wstrd presumably transfer 8 bytes (one pixel row) - confirm.
static void BuildWMMXconst8x8copyrow(C_dyn_code &dc){
dc.Wldrd(dc.wr1, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Wldrd(dc.wr2, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Wstrd(dc.wr1, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
dc.Wstrd(dc.wr2, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
}
//----------------------------
// Emits code that processes two rows: loads wr1/wr2 from [r3] (r3 advancing by 8
// after each), combines each with wr0 using Waddbus, and stores them to [r1]
// (r1 advancing by r2 after each). wr0 is set up by the caller.
// NOTE(review): Waddbus is presumably an unsigned-saturating per-byte add - confirm.
static void BuildWMMXconst8x8addrow(C_dyn_code &dc){
dc.Wldrd(dc.wr1, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Wldrd(dc.wr2, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Waddbus(dc.wr1, dc.wr1, dc.wr0);
dc.Waddbus(dc.wr2, dc.wr2, dc.wr0);
dc.Wstrd(dc.wr1, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
dc.Wstrd(dc.wr2, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
}
//----------------------------
// Emits code that processes two rows: loads wr1/wr2 from [r3] (r3 advancing by 8
// after each), combines each with wr0 using Wsubbus, and stores them to [r1]
// (r1 advancing by r2 after each). wr0 is set up by the caller.
// NOTE(review): Wsubbus is presumably an unsigned-saturating per-byte subtract - confirm.
static void BuildWMMXconst8x8subrow(C_dyn_code &dc){
dc.Wldrd(dc.wr1, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Wldrd(dc.wr2, dc.r3, 0);
dc.Add(dc.r3, dc.r3, 8);
dc.Wsubbus(dc.wr1, dc.wr1, dc.wr0);
dc.Wsubbus(dc.wr2, dc.wr2, dc.wr0);
dc.Wstrd(dc.wr1, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
dc.Wstrd(dc.wr2, dc.r1, 0);
dc.Add(dc.r1, dc.r1, dc.r2);
}
//----------------------------
// Emits the WMMX version of IDCT_Const8x8: an 8x8 block whose only content is a
// constant r0 that is added to (r0 > 0) or subtracted from (r0 < 0) the source block.
// Registers used by the emitted code: r0 = constant, r1 = Dst, r2 = Dst pitch, r3 = Src.
// Every row helper emits code for two rows, so four of them cover the block.
static void BuildWMMXIDCT_Const8x8(C_dyn_code &dc){
   enum{ ROW_PAIRS = 4 };
   void *add_path = dc.Label(false);
   void *sub_path = dc.Label(false);
   void *finished = dc.Label(false);
   int pair;

   dc.Align(16);
   dc.FunctionBegin();
   // dispatch on the sign of the constant
   dc.Cmp(dc.r0, 0);
   dc.Branch(dc.GT, add_path);
   dc.Branch(dc.LT, sub_path);

   // constant == 0: plain copy, skipped entirely when Dst and Src are the same address
   dc.Cmp(dc.r1, dc.r3);
   dc.Branch(dc.EQ, finished);
   for(pair = 0; pair < ROW_PAIRS; ++pair)
      BuildWMMXconst8x8copyrow(dc);
   dc.PutLabel(finished);
   dc.FunctionEnd();

   // constant > 0: broadcast it to wr0 and add it to every row
   dc.PutLabel(add_path);
   dc.Tbcstb(dc.wr0, dc.r0);
   for(pair = 0; pair < ROW_PAIRS; ++pair)
      BuildWMMXconst8x8addrow(dc);
   dc.FunctionEnd();

   // constant < 0: broadcast its negation to wr0 and subtract it from every row
   dc.PutLabel(sub_path);
   dc.Rsb(dc.r0, dc.r0, 0);
   dc.Tbcstb(dc.wr0, dc.r0);
   for(pair = 0; pair < ROW_PAIRS; ++pair)
      BuildWMMXconst8x8subrow(dc);
   dc.FunctionEnd();
}
//----------------------------
#endif
//----------------------------
// Emits all IDCT routines into 'dc': the two shared subroutines first, then
// IDCT_Block4x8, IDCT_Block8x8 and the IDCT_Const8x8 slot.
//  use_wmmx - when USE_IDCT_WMMX is compiled in, also emit the WMMX IDCT_Const8x8
void BuildIDCTFunctions(C_dyn_code &dc, bool use_wmmx){
   // shared subroutines called by both block functions
   void *const col_pass = BuildMCol8(dc);
   void *const const_row = BuildRowConst(dc);

   BuildIDCT_Block4x8(dc, col_pass, const_row);   //IDCT_Block4x8 (13)
   BuildIDCT_Block8x8(dc, col_pass, const_row);   //IDCT_Block8x8 (14)

   //IDCT_Const8x8 (15)
#ifdef USE_IDCT_WMMX
   if(use_wmmx)
      BuildWMMXIDCT_Const8x8(dc);
#endif
   // Generic IDCT_Const8x8 is implemented in C++, so only an empty function is emitted.
   // NOTE(review): this empty function is emitted even after the WMMX version above -
   // confirm that is intended.
   dc.FunctionBegin();
   dc.FunctionEnd();
}
//----------------------------
#else //ARM
//----------------------------
// Fixed-point cosine factors: Wk = 2048*sqrt(2)*cos(k*pi/16), rounded to an integer.
#define W1 2841
#define W2 2676
#define W3 2408
#define W5 1609
#define W6 1108
#define W7 565
// Sums and differences used by the butterflies, derived from the factors above.
#define W1_minus_W7 (W1 - W7)   // 2276
#define W1_plus_W7 (W1 + W7)    // 3406
#define W3_minus_W5 (W3 - W5)   // 799
#define W3_plus_W5 (W3 + W5)    // 4017
#define W2_minus_W6 (W2 - W6)   // 1568
#define W2_plus_W6 (W2 + W6)    // 3784
// In-place 8-point inverse DCT of one column of an 8x8 coefficient block.
//  Blk - points at the column's top element; successive rows are 8 elements apart
//        (Blk[0], Blk[8], ... Blk[56]).
// Columns with few non-zero coefficients take shorter paths:
//  - only Blk[0] non-zero (or nothing at all)
//  - only Blk[0] and Blk[8]
//  - only the even rows Blk[0], Blk[16], Blk[32], Blk[48]
// Scaling by powers of two is done with multiplications instead of '<<', because
// left-shifting a negative coefficient is undefined behaviour.
static void IDCT_Col8(idct_block_t *Blk){
   int x0, x1, x2, x3, x4, x5, x6, x7, x8;
   int x567, x123;

   x0 = Blk[0];
   x4 = Blk[8];
   x3 = Blk[16];
   x7 = Blk[24];
   x1 = Blk[32] * 2048;
   x6 = Blk[40];
   x2 = Blk[48];
   x5 = Blk[56];
   x123 = x1|x2|x3;
   x567 = x5|x6|x7;

   if(!(x123|x567)){
      if(!x4){
                              // x0 only; an all-zero column stays as it is
         if(x0)
            Blk[0] = Blk[8] = Blk[16] = Blk[24] = Blk[32] = Blk[40] = Blk[48] = Blk[56] = (idct_block_t)(x0 * 8);
      }else{
                              // x0,x4
         x0 = x0 * 2048 + 128;
         x5 = W7 * x4;
         x1 = W1 * x4;
         x2 = ((181 * W1_plus_W7 + 128) >> 8) * x4;
         x4 = ((181 * W1_minus_W7 + 128) >> 8) * x4;
         Blk[0] = (idct_block_t)((x0 + x1) >> 8);
         Blk[8] = (idct_block_t)((x0 + x2) >> 8);
         Blk[16] = (idct_block_t)((x0 + x4) >> 8);
         Blk[24] = (idct_block_t)((x0 + x5) >> 8);
         Blk[32] = (idct_block_t)((x0 - x5) >> 8);
         Blk[40] = (idct_block_t)((x0 - x4) >> 8);
         Blk[48] = (idct_block_t)((x0 - x2) >> 8);
         Blk[56] = (idct_block_t)((x0 - x1) >> 8);
      }
   }else
   if(!(x4|x567)){
                              // x0,x1,x2,x3: the output is symmetric
      x0 = x0 * 2048 + 128;
      x8 = x0 + x1;
      x0 -= x1;
      x1 = W6 * (x3 + x2);
      x2 = x1 - (W2_plus_W6) * x2;
      x3 = x1 + (W2_minus_W6) * x3;
      x7 = x8 + x3;
      x8 -= x3;
      x3 = x0 + x2;
      x0 -= x2;
      Blk[0] = (idct_block_t)(x7 >> 8);
      Blk[8] = (idct_block_t)(x3 >> 8);
      Blk[16] = (idct_block_t)(x0 >> 8);
      Blk[24] = (idct_block_t)(x8 >> 8);
      Blk[32] = (idct_block_t)(x8 >> 8);
      Blk[40] = (idct_block_t)(x0 >> 8);
      Blk[48] = (idct_block_t)(x3 >> 8);
      Blk[56] = (idct_block_t)(x7 >> 8);
   }else{
                              // x0,x1,x2,x3,x4,x5,x6,x7: full transform
      x0 = x0 * 2048 + 128;
      x8 = W7 * (x4 + x5);
      x4 = x8 + (W1_minus_W7) * x4;
      x5 = x8 - (W1_plus_W7) * x5;
      x8 = W3 * (x6 + x7);
      x6 = x8 - (W3_minus_W5) * x6;
      x7 = x8 - (W3_plus_W5) * x7;
      x8 = x0 + x1;
      x0 -= x1;
      x1 = W6 * (x3 + x2);
      x2 = x1 - (W2_plus_W6) * x2;
      x3 = x1 + (W2_minus_W6) * x3;
      x1 = x4 + x6;
      x4 -= x6;
      x6 = x5 + x7;
      x5 -= x7;
      x7 = x8 + x3;
      x8 -= x3;
      x3 = x0 + x2;
      x0 -= x2;
      x2 = (181 * (x4 + x5) + 128) >> 8;
      x4 = (181 * (x4 - x5) + 128) >> 8;
      Blk[0] = (idct_block_t)((x7 + x1) >> 8);
      Blk[8] = (idct_block_t)((x3 + x2) >> 8);
      Blk[16] = (idct_block_t)((x0 + x4) >> 8);
      Blk[24] = (idct_block_t)((x8 + x6) >> 8);
      Blk[32] = (idct_block_t)((x8 - x6) >> 8);
      Blk[40] = (idct_block_t)((x0 - x4) >> 8);
      Blk[48] = (idct_block_t)((x3 - x2) >> 8);
      Blk[56] = (idct_block_t)((x7 - x1) >> 8);
   }
}
static void IDCT_RowConst(int v, byte *Dst, const byte *Src){
if (Src) {
dword MaskCarry = 0x80808080U;
dword a,b,c,d;
a = ((dword*)Src)[0];
d = ((dword*)Src)[1];
if(v>0){
v |= v << 8;
v |= v << 16;
ADDSAT32(a, ((dword*)Dst)[0], v);
ADDSAT32(d, ((dword*)Dst)[1], v);
}else
if(v<0){
v = -v;
v |= v << 8;
v |= v << 16;
SUBSAT32(a,((dword*)Dst)[0],v);
SUBSAT32(d,((dword*)Dst)[1],v);
}else{
((dword*)Dst)[0] = a;
((dword*)Dst)[1] = d;
}
}else{
SAT(v);
v &= 255;
v |= v << 8;
v |= v << 16;
((dword*)Dst)[1] = ((dword*)Dst)[0] = v;
}
}
//----------------------------
// Row pass of the 2D iDCT for a row in which all 8 coefficients may be non-zero.
//  Blk - the 8 coefficients of one row (Blk[0]..Blk[7]); only read
//  Dst - receives the 8 resulting pixels
//  Src - optional 8 prediction pixels added to the result; may be NULL
// Each output is clamped to 0..255 before being stored.
static void IDCT_Row8(idct_block_t *Blk, byte *Dst, const byte *Src){
    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
    x4 = Blk[1];
    x3 = Blk[2];
    x7 = Blk[3];
    x1 = Blk[4];
    x6 = Blk[5];
    x2 = Blk[6];
    x5 = Blk[7];
    // shortcut: only Blk[0] is non-zero, so all 8 outputs share one value
    if (!(x1|x2|x3|x4|x5|x6|x7))
    {
        IDCT_RowConst((Blk[0] + 32) >> 6,Dst,Src);
        return;
    }
    x1 <<= 8;
    // 8192 is half of 1<<14: rounding term for the final >> 14
    x0 = (Blk[0] << 8) + 8192;
    // odd part; the +4 rounds the following >> 3
    x8 = W7 * (x4 + x5) + 4;
    x4 = (x8 + (W1_minus_W7) * x4) >> 3;
    x5 = (x8 - (W1_plus_W7) * x5) >> 3;
    x8 = W3 * (x6 + x7) + 4;
    x6 = (x8 - (W3_minus_W5) * x6) >> 3;
    x7 = (x8 - (W3_plus_W5) * x7) >> 3;
    // even part
    x8 = x0 + x1;
    x0 -= x1;
    x1 = W6 * (x3 + x2) + 4;
    x2 = (x1 - (W2_plus_W6) * x2) >> 3;
    x3 = (x1 + (W2_minus_W6) * x3) >> 3;
    // butterflies
    x1 = x4 + x6;
    x4 -= x6;
    x6 = x5 + x7;
    x5 -= x7;
    x7 = x8 + x3;
    x8 -= x3;
    x3 = x0 + x2;
    x0 -= x2;
    // 181/256 approximates 1/sqrt(2)
    x2 = (181 * (x4 + x5) + 128) >> 8;
    x4 = (181 * (x4 - x5) + 128) >> 8;
    // final outputs, scaled back by >> 14
    x5 = (x7 + x1) >> 14;
    x1 = (x7 - x1) >> 14;
    x7 = (x3 + x2) >> 14;
    x2 = (x3 - x2) >> 14;
    x3 = (x0 + x4) >> 14;
    x4 = (x0 - x4) >> 14;
    x0 = (x8 + x6) >> 14;
    x6 = (x8 - x6) >> 14;
    if (Src)
    {
        // add the prediction
        x5 += Src[0];
        x1 += Src[7];
        x7 += Src[1];
        x2 += Src[6];
        x3 += Src[2];
        x4 += Src[5];
        x0 += Src[3];
        x6 += Src[4];
    }
    // A value outside 0..255 has at least one bit set above bit 7 (negative
    // values carry the sign bits), so one OR-and-shift test covers all eight.
    if ((x5|x1|x7|x2|x3|x4|x0|x6) >> 8)
    {
        // Explicit clamp: correct for every int value. The SAT bit trick only
        // saturates an overflow when bit 8 of the value is set.
        x5 = x5 < 0 ? 0 : (x5 > 255 ? 255 : x5);
        x7 = x7 < 0 ? 0 : (x7 > 255 ? 255 : x7);
        x3 = x3 < 0 ? 0 : (x3 > 255 ? 255 : x3);
        x0 = x0 < 0 ? 0 : (x0 > 255 ? 255 : x0);
        x6 = x6 < 0 ? 0 : (x6 > 255 ? 255 : x6);
        x4 = x4 < 0 ? 0 : (x4 > 255 ? 255 : x4);
        x2 = x2 < 0 ? 0 : (x2 > 255 ? 255 : x2);
        x1 = x1 < 0 ? 0 : (x1 > 255 ? 255 : x1);
    }
    Dst[0] = (byte)x5;
    Dst[1] = (byte)x7;
    Dst[2] = (byte)x3;
    Dst[3] = (byte)x0;
    Dst[4] = (byte)x6;
    Dst[5] = (byte)x4;
    Dst[6] = (byte)x2;
    Dst[7] = (byte)x1;
}
//----------------------------
// Row pass of the 2D iDCT for a row where only Blk[0]..Blk[3] are used.
// Blk[4]..Blk[7] are never read; the arithmetic is the 8-coefficient row
// pass with those four inputs taken as zero.
//  Blk - row coefficients; only Blk[0]..Blk[3] are read
//  Dst - receives the 8 resulting pixels
//  Src - optional 8 prediction pixels added to the result; may be NULL
// Each output is clamped to 0..255 before being stored.
static void IDCT_Row4(idct_block_t *Blk, byte *Dst, const byte *Src){
    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
    x4 = Blk[1];
    x3 = Blk[2];
    x7 = Blk[3];
    // shortcut: only Blk[0] is non-zero, so all 8 outputs share one value
    if (!(x3|x4|x7))
    {
        IDCT_RowConst((Blk[0] + 32) >> 6,Dst,Src);
        return;
    }
    // 8192 is half of 1<<14: rounding term for the final >> 14
    x0 = (Blk[0] << 8) + 8192;
    // odd part; the +4 rounds the >> 3
    x5 = (W7 * x4 + 4) >> 3;
    x4 = (W1 * x4 + 4) >> 3;
    x6 = (W3 * x7 + 4) >> 3;
    x7 = (-W5 * x7 + 4) >> 3;
    // even part
    x2 = (W6 * x3 + 4) >> 3;
    x3 = (W2 * x3 + 4) >> 3;
    // butterflies
    x1 = x4 + x6;
    x4 -= x6;
    x6 = x5 + x7;
    x5 -= x7;
    x7 = x0 + x3;
    x8 = x0 - x3;
    x3 = x0 + x2;
    x0 -= x2;
    // 181/256 approximates 1/sqrt(2)
    x2 = (181 * (x4 + x5) + 128) >> 8;
    x4 = (181 * (x4 - x5) + 128) >> 8;
    // final outputs, scaled back by >> 14
    x5 = (x7 + x1) >> 14;
    x1 = (x7 - x1) >> 14;
    x7 = (x3 + x2) >> 14;
    x2 = (x3 - x2) >> 14;
    x3 = (x0 + x4) >> 14;
    x4 = (x0 - x4) >> 14;
    x0 = (x8 + x6) >> 14;
    x6 = (x8 - x6) >> 14;
    if (Src)
    {
        // add the prediction
        x5 += Src[0];
        x1 += Src[7];
        x7 += Src[1];
        x2 += Src[6];
        x3 += Src[2];
        x4 += Src[5];
        x0 += Src[3];
        x6 += Src[4];
    }
    // A value outside 0..255 has at least one bit set above bit 7 (negative
    // values carry the sign bits), so one OR-and-shift test covers all eight.
    if ((x5|x1|x7|x2|x3|x4|x0|x6) >> 8)
    {
        // Explicit clamp: correct for every int value. The SAT bit trick only
        // saturates an overflow when bit 8 of the value is set.
        x5 = x5 < 0 ? 0 : (x5 > 255 ? 255 : x5);
        x7 = x7 < 0 ? 0 : (x7 > 255 ? 255 : x7);
        x3 = x3 < 0 ? 0 : (x3 > 255 ? 255 : x3);
        x0 = x0 < 0 ? 0 : (x0 > 255 ? 255 : x0);
        x6 = x6 < 0 ? 0 : (x6 > 255 ? 255 : x6);
        x4 = x4 < 0 ? 0 : (x4 > 255 ? 255 : x4);
        x2 = x2 < 0 ? 0 : (x2 > 255 ? 255 : x2);
        x1 = x1 < 0 ? 0 : (x1 > 255 ? 255 : x1);
    }
    Dst[0] = (byte)x5;
    Dst[1] = (byte)x7;
    Dst[2] = (byte)x3;
    Dst[3] = (byte)x0;
    Dst[4] = (byte)x6;
    Dst[5] = (byte)x4;
    Dst[6] = (byte)x2;
    Dst[7] = (byte)x1;
}
//----------------------------
// Full 8x8 inverse transform: column pass over all 8 columns of Block
// (in place), then row pass over all 8 rows writing pixels to Dest.
//  Block      - 64 coefficients, row-major with 8 entries per row; modified
//  Dest       - first output row; successive rows are DestStride bytes apart
//  DestStride - distance in bytes between output rows
//  Src        - optional prediction block with 8 bytes per row, or NULL
void IDCT_Block8x8(idct_block_t *Block, byte *Dest, int DestStride, const byte *Src){
    // prediction rows are 8 bytes apart; with no prediction the pointer stays NULL
    const int PredStride = Src ? 8 : 0;
    int Col, Row;

    for (Col = 0; Col < 8; ++Col)
        IDCT_Col8(Block + Col);

    for (Row = 0; Row < 8; ++Row)
        IDCT_Row8(Block + 8*Row, Dest + Row*DestStride, Src + Row*PredStride);
}
//----------------------------
// Reduced inverse transform: column pass over columns 0..3 of Block only
// (in place), then the 4-coefficient row pass over all 8 rows.
//  Block      - 64 coefficients, row-major with 8 entries per row; modified
//  Dest       - first output row; successive rows are DestStride bytes apart
//  DestStride - distance in bytes between output rows
//  Src        - optional prediction block with 8 bytes per row, or NULL
void IDCT_Block4x8(idct_block_t *Block, byte *Dest, int DestStride, const byte *Src){
    // prediction rows are 8 bytes apart; with no prediction the pointer stays NULL
    const int PredStride = Src ? 8 : 0;
    int Col, Row;

    for (Col = 0; Col < 4; ++Col)
        IDCT_Col8(Block + Col);

    for (Row = 0; Row < 8; ++Row)
        IDCT_Row4(Block + 8*Row, Dest + Row*DestStride, Src + Row*PredStride);
}
#endif //!ARM
//----------------------------
void IDCT_Const8x8(int v, byte * Dst, int DstPitch, const byte *Src){
int SrcPitch = 8;
const byte* SrcEnd = Src + 8*SrcPitch;
dword MaskCarry = 0x80808080U;
dword a,b,c,d;
if(v>0){
v |= v << 8;
v |= v << 16;
do{
a = ((dword*)Src)[0];
d = ((dword*)Src)[1];
ADDSAT32(a,((dword*)Dst)[0],v);
ADDSAT32(d,((dword*)Dst)[1],v);
Dst += DstPitch;
Src += SrcPitch;
}while (Src != SrcEnd);
}else
if(v<0){
v = -v;
v |= v << 8;
v |= v << 16;
do{
a = ((dword*)Src)[0];
d = ((dword*)Src)[1];
SUBSAT32(a,((dword*)Dst)[0],v);
SUBSAT32(d,((dword*)Dst)[1],v);
Dst += DstPitch;
Src += SrcPitch;
}while (Src != SrcEnd);
}
}
//----------------------------