/* www.pudn.com > x264_2007.rar > quant.c */


/*****************************************************************************
* quant.c: h264 encoder
*****************************************************************************
* Authors: Guillaume Poirier 
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
*****************************************************************************/

#if defined SYS_LINUX
#include <altivec.h>
#endif

/* Scalar/vector overlay: four unsigned ints sharing storage with one AltiVec
 * vector of unsigned int.  The functions below store a scalar into s[0] and
 * then read the union back through v to get that value into a vector. */
typedef union {
  unsigned int s[4];
  vector unsigned int v;
} vect_int_u;

/* Scalar/vector overlay: eight unsigned shorts sharing storage with one
 * AltiVec vector of unsigned short.  Used below to move a 16-bit scalar
 * (written to s[0]) into a vector (read through v). */
typedef union {
  unsigned short s[8];
  vector unsigned short v;
} vect_ushort_u;

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"            

/*
 * QUANT_16_U: quant of a whole 4x4 block (16 int16_t coefficients, i.e. two
 * 8-lane vectors), in place, unrolled 2x and "pre-scheduled".
 *
 *   dct0, dct1            byte offsets into dct of the two coefficient vectors
 *   quant_mf0..quant_mf3  byte offsets into quant_mf of the four vectors of
 *                         32-bit multipliers belonging to those coefficients
 *
 * Per lane: |coef| * mf, plus fV, shifted right by i_qbitsv, packed back to
 * 16 bits, then the sign of the original coefficient is re-applied.
 *
 * The macro deliberately captures names from the expansion site: dct,
 * quant_mf, temp1v, temp2v, mfvA, mfvB, mskA, mskB, coefvA, coefvB,
 * multEvenvA, multOddvA, multEvenvB, multOddvB, fV, i_qbitsv, zerov and one
 * must all be in scope there.
 *
 * It expands to exactly one statement (do { ... } while (0)), so it can be
 * used as the body of an unbraced if/else or loop.  Terminate the invocation
 * with a semicolon.
 */
#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 ) \
do {                                                                         \
    /* load 2 x 8 coefficients */                                            \
    temp1v = vec_ld((dct0), *dct);                                           \
    temp2v = vec_ld((dct1), *dct);                                           \
    /* load 4 x 4 32-bit multipliers, saturate-pack them to 2 x 8 u16 */     \
    mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf)); \
    mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf)); \
    /* remember which lanes are negative, then take max(-x, x) */            \
    mskA = vec_cmplt(temp1v, zerov);                                         \
    mskB = vec_cmplt(temp2v, zerov);                                         \
    coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v);             \
    coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v);             \
    /* 16x16 -> 32-bit products, even and odd lanes separately */            \
    multEvenvA = vec_mule(coefvA, mfvA);                                     \
    multOddvA = vec_mulo(coefvA, mfvA);                                      \
    multEvenvB = vec_mule(coefvB, mfvB);                                     \
    multOddvB = vec_mulo(coefvB, mfvB);                                      \
    /* add fV with saturation */                                             \
    multEvenvA = vec_adds(multEvenvA, fV);                                   \
    multOddvA = vec_adds(multOddvA, fV);                                     \
    multEvenvB = vec_adds(multEvenvB, fV);                                   \
    multOddvB = vec_adds(multOddvB, fV);                                     \
    /* shift right by i_qbitsv */                                            \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                               \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                                 \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                               \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                                 \
    /* re-interleave even/odd results into lane order, pack to 16 bits */    \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    /* negate lanes that were negative: (x ^ mask) + (mask & 1) */           \
    temp1v = vec_xor(temp1v, mskA);                                          \
    temp2v = vec_xor(temp2v, mskB);                                          \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                           \
    vec_st(temp1v, (dct0), (int16_t*)dct);                                   \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                           \
    vec_st(temp2v, (dct1), (int16_t*)dct);                                   \
} while (0)
                
/*
 * Quantize one 4x4 block of coefficients in place with AltiVec.
 *
 *   dct       the 16 coefficients; overwritten with the quantized values
 *   quant_mf  one multiplier per coefficient (QUANT_16_U packs each to 16 bits)
 *   i_qbits   right shift applied to every product
 *   f         value added to every product before the shift
 *
 * dct and quant_mf are accessed through vec_ld/vec_st -- presumably both must
 * be 16-byte aligned; TODO confirm against the callers.
 */
void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
    /* Scratch vectors.  QUANT_16_U refers to every one of them by name,
     * so these identifiers must not be renamed. */
    vector bool short mskA, mskB;
    vec_s16_t temp1v, temp2v;
    vec_u16_t coefvA, coefvB;
    vec_u16_t mfvA, mfvB;
    vec_u32_t multEvenvA, multOddvA;
    vec_u32_t multEvenvB, multOddvB;

    /* Operands shared by all lanes, also picked up by the macro. */
    vec_s16_t zerov, one;
    vec_u32_t i_qbitsv, fV;

    /* Only lane 0 of each union is written; vec_splat then replicates that
     * lane across the whole vector. */
    vect_int_u shift_src, round_src;

    shift_src.s[0] = i_qbits;
    round_src.s[0] = f;
    i_qbitsv = vec_splat(shift_src.v, 0);
    fV = vec_splat(round_src.v, 0);

    zerov = vec_splat_s16(0);
    one = vec_splat_s16(1);

    /* Byte offsets: coefficient vectors at 0 and 16, multiplier vectors at
     * 0, 16, 32 and 48. */
    QUANT_16_U( 0, 16, 0, 16, 32, 48 );
}

/*
 * QUANT_16_U_DC: DC quant of a whole 4x4 block (16 int16_t coefficients, two
 * 8-lane vectors), in place, unrolled 2x and "pre-scheduled".
 *
 *   dct0, dct1  byte offsets into dct of the two coefficient vectors
 *
 * Same computation as QUANT_16_U except that every lane is multiplied by the
 * single multiplier vector mfv instead of per-coefficient values loaded from
 * a table.
 *
 * The macro deliberately captures names from the expansion site: dct,
 * temp1v, temp2v, mfv, mskA, mskB, coefvA, coefvB, multEvenvA, multOddvA,
 * multEvenvB, multOddvB, fV, i_qbitsv, zerov and one must all be in scope.
 *
 * It expands to exactly one statement (do { ... } while (0)), so it can be
 * used as the body of an unbraced if/else or loop.  Terminate the invocation
 * with a semicolon.
 *
 * NOTE(review): the additions here use modular vec_add, whereas QUANT_16_U
 * uses saturating vec_adds -- confirm whether the difference is intentional.
 */
#define QUANT_16_U_DC( dct0, dct1 )                             \
do {                                                            \
    /* load 2 x 8 coefficients */                               \
    temp1v = vec_ld((dct0), *dct);                              \
    temp2v = vec_ld((dct1), *dct);                              \
    /* remember which lanes are negative, take max(-x, x) */    \
    mskA = vec_cmplt(temp1v, zerov);                            \
    mskB = vec_cmplt(temp2v, zerov);                            \
    coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
    coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
    /* 16x16 -> 32-bit products, even and odd lanes */          \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvB = vec_mule(coefvB, mfv);                         \
    multOddvB = vec_mulo(coefvB, mfv);                          \
    /* add fV */                                                \
    multEvenvA = vec_add(multEvenvA, fV);                       \
    multOddvA = vec_add(multOddvA, fV);                         \
    multEvenvB = vec_add(multEvenvB, fV);                       \
    multOddvB = vec_add(multOddvB, fV);                         \
    /* shift right by i_qbitsv */                               \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    /* re-interleave even/odd results, pack to 16 bits */       \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    /* negate lanes that were negative: (x ^ mask) + (mask & 1) */ \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_add(temp1v, vec_and(mskA, one));               \
    vec_st(temp1v, (dct0), (int16_t*)dct);                      \
    temp2v = vec_add(temp2v, vec_and(mskB, one));               \
    vec_st(temp2v, (dct1), (int16_t*)dct);                      \
} while (0)


/*
 * Quantize one 4x4 block of DC coefficients in place with AltiVec, using the
 * same multiplier for every coefficient.
 *
 *   dct         the 16 coefficients; overwritten with the quantized values
 *   i_quant_mf  multiplier applied to every coefficient
 *   i_qbits     right shift applied to every product
 *   f           value added to every product before the shift
 *
 * dct is accessed through vec_ld/vec_st -- presumably it must be 16-byte
 * aligned; TODO confirm against the callers.
 */
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
    /* Scratch vectors.  QUANT_16_U_DC refers to every one of them by name,
     * so these identifiers must not be renamed. */
    vector bool short mskA, mskB;
    vec_s16_t temp1v, temp2v;
    vec_u16_t coefvA, coefvB;
    vec_u32_t multEvenvA, multOddvA;
    vec_u32_t multEvenvB, multOddvB;

    /* Operands shared by all lanes, also picked up by the macro. */
    vec_s16_t zerov, one;
    vec_u16_t mfv;
    vec_u32_t i_qbitsv, fV;

    /* Only lane 0 of each union is written; vec_splat then replicates that
     * lane across the whole vector. */
    vect_ushort_u mf_src;
    vect_int_u shift_src, round_src;

    /* int -> unsigned short: only the low 16 bits of i_quant_mf are kept. */
    mf_src.s[0] = i_quant_mf;
    mfv = vec_splat( mf_src.v, 0 );

    shift_src.s[0] = i_qbits;
    i_qbitsv = vec_splat(shift_src.v, 0);

    round_src.s[0] = f;
    fV = vec_splat(round_src.v, 0);

    zerov = vec_splat_s16(0);
    one = vec_splat_s16(1);

    /* Byte offsets of the two coefficient vectors. */
    QUANT_16_U_DC( 0, 16 );
}


/*
 * Quantize one 8x8 block of coefficients in place with AltiVec.
 *
 *   dct       the 64 coefficients; overwritten with the quantized values
 *   quant_mf  one multiplier per coefficient (QUANT_16_U packs each to 16 bits)
 *   i_qbits   right shift applied to every product
 *   f         value added to every product before the shift
 *
 * dct and quant_mf are accessed through vec_ld/vec_st -- presumably both must
 * be 16-byte aligned; TODO confirm against the callers.
 */
void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
    /* Scratch vectors.  QUANT_16_U refers to every one of them by name,
     * so these identifiers must not be renamed. */
    vector bool short mskA, mskB;
    vec_s16_t temp1v, temp2v;
    vec_u16_t coefvA, coefvB;
    vec_u16_t mfvA, mfvB;
    vec_u32_t multEvenvA, multOddvA;
    vec_u32_t multEvenvB, multOddvB;

    /* Operands shared by all lanes, also picked up by the macro. */
    vec_s16_t zerov, one;
    vec_u32_t i_qbitsv, fV;

    /* Only lane 0 of each union is written; vec_splat then replicates that
     * lane across the whole vector. */
    vect_int_u shift_src, round_src;

    /* Byte offsets of the current group of 16 coefficients / multipliers. */
    int dct_off, mf_off;

    shift_src.s[0] = i_qbits;
    round_src.s[0] = f;
    i_qbitsv = vec_splat(shift_src.v, 0);
    fV = vec_splat(round_src.v, 0);

    zerov = vec_splat_s16(0);
    one = vec_splat_s16(1);

    /* Four passes of 16 coefficients each: every pass consumes 32 bytes of
     * dct (two vectors) and 64 bytes of quant_mf (four vectors). */
    for ( dct_off = 0, mf_off = 0; dct_off < 4*2*16; dct_off += 2*16, mf_off += 4*16 ) {
      QUANT_16_U( dct_off, dct_off+16, mf_off, mf_off+16, mf_off+32, mf_off+48 );
    }
}