www.pudn.com > T264-src-0.02.zip > t264enc.c


/***************************************************************************** 
 * 
 *  T264 AVC CODEC 
 * 
 *  Copyright(C) 2004-2005 llcc  
 *               2004-2005 visionany  
 * 
 *  This program is free software ; you can redistribute it and/or modify 
 *  it under the terms of the GNU General Public License as published by 
 *  the Free Software Foundation ; either version 2 of the License, or 
 *  (at your option) any later version. 
 * 
 *  This program is distributed in the hope that it will be useful, 
 *  but WITHOUT ANY WARRANTY ; without even the implied warranty of 
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 *  GNU General Public License for more details. 
 * 
 *  You should have received a copy of the GNU General Public License 
 *  along with this program ; if not, write to the Free Software 
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA 
 * 
 ****************************************************************************/ 
 
#include "stdio.h" 
#include "memory.h" 
 
#include "t264.h" 
#include "utility.h" 
#include "intra.h" 
#include "cavlc.h" 
#include "inter.h" 
#include "interpolate.h" 
#include "estimation.h" 
#include "deblock.h" 
#include "ratecontrol.h" 
#include "sse2\sse2.h" 
#include "math.h" 
 
/*
 * Chroma QP lookup, 52 entries.  In this file it is indexed with
 * clip3(chroma_qp_index_offset + qp_y, 0, 51).  Entries 0..29 map to
 * themselves; from 30 upwards the curve flattens and ends at 39.
 */
static const int32_t chroma_qp[] =
{
     0,  1,  2,  3,  4,  5,  6,  7,
     8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 29, 30,
    31, 32, 32, 33, 34, 34, 35, 35,
    36, 36, 37, 37, 37, 38, 38, 38,
    39, 39, 39, 39
};
 
/*
 * Per-QP cost weight, 52 entries; T264_mb_load_context stores
 * qp_cost[qp_y] in t->mb.lambda.  The original note describes the table as
 * converting an H.263 QP to an H.26L quant via quant = pow(2, QP / 6).
 */
static const int32_t qp_cost[52]=
{
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  4,  4,  4,
     5,  6,  6,  7,  8,  9, 10, 11, 13, 14, 16, 18, 20,
    23, 25, 29, 32, 36, 40, 45, 51, 57, 64, 72, 81, 91
};
 
float  
psnr(uint8_t* p1, uint8_t* p2, int32_t size) 
{ 
    float sad = 0; 
    int32_t i; 
 
    for (i = 0 ; i < size ; i ++) 
    { 
        int32_t tmp; 
        tmp = (p1[i] - p2[i]); 
        sad += tmp * tmp; 
    } 
 
    return (float)(10 * log10(65025.0f * size / sad)); 
} 
 
/*
 * Load the working state in t->mb for the macroblock in column mb_x,
 * row mb_y (macroblock units).
 *
 * Sets the position, the neighbour availability flags, the source and
 * reconstruction plane pointers, the chroma QP and lambda derived from
 * t->qp_y, and then fills the border entries of the neighbour caches
 * (intra 4x4 modes, motion vectors with partition info, non-zero counts and
 * SADs) from the top, top-right, left and top-left macroblocks stored in
 * t->rec->mb.  An unavailable neighbour is marked with mode -1, vector
 * refno -2, part/subpart -1, non-zero count 0x80 and SAD -1.
 */
void
T264_mb_load_context(T264_t* t, int32_t mb_y, int32_t mb_x)
{
    int32_t qpc;
    int32_t i, j;

    t->mb.mb_x = mb_x;
    t->mb.mb_y = mb_y;
    t->mb.mb_xy = t->mb.mb_y * t->mb_stride + t->mb.mb_x;
    t->mb.mb_neighbour = 0;
    if (mb_x != 0)
        t->mb.mb_neighbour |= MB_LEFT;
    if (mb_y != 0)
    {
        t->mb.mb_neighbour |= MB_TOP;
        if (mb_x != t->mb_stride - 1)
            t->mb.mb_neighbour |= MB_TOPRIGHT;
    }

    /* source planes use the plain strides, reconstruction planes the edged ones */
    t->mb.src_y = t->cur.Y[0]  + (mb_y << 4) * t->stride          + (mb_x << 4);
    t->mb.dst_y = t->rec->Y[0] + (mb_y << 4) * t->edged_stride    + (mb_x << 4);
    t->mb.src_u = t->cur.U     + (mb_y << 3) * t->stride_uv       + (mb_x << 3);
    t->mb.dst_u = t->rec->U    + (mb_y << 3) * t->edged_stride_uv + (mb_x << 3);
    t->mb.src_v = t->cur.V     + (mb_y << 3) * t->stride_uv       + (mb_x << 3);
    t->mb.dst_v = t->rec->V    + (mb_y << 3) * t->edged_stride_uv + (mb_x << 3);

    t->mb.mb_qp_delta = 0;
    /* t->ps.chroma_qp_index_offset maybe modify in ratecontrol */
    qpc = clip3(t->ps.chroma_qp_index_offset + t->qp_y, 0, 51);
    t->qp_uv = chroma_qp[qpc];
    t->mb.lambda = qp_cost[t->qp_y];

    /* slot that T264_mb_save_context() writes this macroblock into */
    t->mb.context = &t->rec->mb[t->mb.mb_xy];

#define INITINVALIDVEC(vec) do { (vec).refno = -2; (vec).x = (vec).y = 0; } while (0)
    /* top-left corner and the cell right of each row start out as unavailable */
    INITINVALIDVEC(t->mb.vec_ref[0].vec);
    INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4].vec);
    INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].vec);
    INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].vec);
    INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].vec);

    t->mb.vec_ref[0].part = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4].part      = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].part  = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].part = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].part = -1;

    t->mb.vec_ref[0].subpart = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4].subpart      = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].subpart  = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].subpart = -1;
    t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].subpart = -1;

    /* the macroblock's own partition info and vectors are undecided yet */
    memset(t->mb.submb_part, -1, sizeof(t->mb.submb_part));
    t->mb.mb_part = -1;
    for(i = 0 ; i < 2 ; i ++)
    {
        for(j = 0 ; j < 16 ; j ++)
        {
            INITINVALIDVEC(t->mb.vec[i][j]);
        }
    }
    t->mb.sad_ref[0] = t->mb.sad_ref[1] = t->mb.sad_ref[2] = -1;

    //intra_4x4 prediction modes and non-zero counts
    if( mb_y > 0 )
    {
        /* int32_t like the other neighbour indices below: an int16_t would
           wrap once the picture has more than 32767 macroblocks */
        int32_t top_xy  = t->mb.mb_xy - t->mb_stride;
        /* intra 4x4 pred mode layout
            ? x x x x
            x
            x
            x
            x
         */
        /* bottom row of the macroblock above */
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 0] = t->rec->mb[top_xy].mode_i4x4[10];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 1] = t->rec->mb[top_xy].mode_i4x4[11];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 2] = t->rec->mb[top_xy].mode_i4x4[14];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 3] = t->rec->mb[top_xy].mode_i4x4[15];

        t->mb.vec_ref[IPM_LUMA - 8 + 0].vec = t->rec->mb[top_xy].vec[0][12];
        t->mb.vec_ref[IPM_LUMA - 8 + 1].vec = t->rec->mb[top_xy].vec[0][13];
        t->mb.vec_ref[IPM_LUMA - 8 + 2].vec = t->rec->mb[top_xy].vec[0][14];
        t->mb.vec_ref[IPM_LUMA - 8 + 3].vec = t->rec->mb[top_xy].vec[0][15];

        t->mb.vec_ref[IPM_LUMA - 8 + 0].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 1].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 2].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 3].part = t->rec->mb[top_xy].mb_part;

        t->mb.vec_ref[IPM_LUMA - 8 + 0].subpart = t->rec->mb[top_xy].submb_part[12];
        t->mb.vec_ref[IPM_LUMA - 8 + 1].subpart = t->rec->mb[top_xy].submb_part[13];
        t->mb.vec_ref[IPM_LUMA - 8 + 2].subpart = t->rec->mb[top_xy].submb_part[14];
        t->mb.vec_ref[IPM_LUMA - 8 + 3].subpart = t->rec->mb[top_xy].submb_part[15];

        t->mb.sad_ref[1] = t->rec->mb[top_xy].sad;

        if (mb_x != t->mb_stride - 1)
        {
            /* bottom-left block of the macroblock above and to the right */
            int32_t righttop_xy = top_xy + 1;
            t->mb.vec_ref[IPM_LUMA - 8 + 4].vec     = t->rec->mb[righttop_xy].vec[0][12];
            t->mb.vec_ref[IPM_LUMA - 8 + 4].part    = t->rec->mb[righttop_xy].mb_part;
            t->mb.vec_ref[IPM_LUMA - 8 + 4].subpart = t->rec->mb[righttop_xy].submb_part[12];
            t->mb.sad_ref[2] = t->rec->mb[righttop_xy].sad;
        }
        /* nnz layout:
          ? x x x x ? x x
          x         x
          x         x
          x         ? x x
          x         x
                    x
         */
        t->mb.nnz_ref[NNZ_LUMA - 8 + 0] = t->rec->mb[top_xy].nnz[12];
        t->mb.nnz_ref[NNZ_LUMA - 8 + 1] = t->rec->mb[top_xy].nnz[13];
        t->mb.nnz_ref[NNZ_LUMA - 8 + 2] = t->rec->mb[top_xy].nnz[14];
        t->mb.nnz_ref[NNZ_LUMA - 8 + 3] = t->rec->mb[top_xy].nnz[15];

        t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 0] = t->rec->mb[top_xy].nnz[18];
        t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 1] = t->rec->mb[top_xy].nnz[19];
        t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 0] = t->rec->mb[top_xy].nnz[22];
        t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 1] = t->rec->mb[top_xy].nnz[23];
    }
    else
    {
        /* load intra4x4 */
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 0] =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 1] =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 2] =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 3] = -1;

        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 0].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 1].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 2].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 3].vec);

        t->mb.vec_ref[IPM_LUMA - 8 + 0].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 1].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 2].part =
        t->mb.vec_ref[IPM_LUMA - 8 + 3].part = -1;

        t->mb.vec_ref[IPM_LUMA - 8 + 0].subpart =
        t->mb.vec_ref[IPM_LUMA - 8 + 1].subpart =
        t->mb.vec_ref[IPM_LUMA - 8 + 2].subpart =
        t->mb.vec_ref[IPM_LUMA - 8 + 3].subpart = -1;

        t->mb.nnz_ref[NNZ_LUMA - 8 + 0] =
        t->mb.nnz_ref[NNZ_LUMA - 8 + 1] =
        t->mb.nnz_ref[NNZ_LUMA - 8 + 2] =
        t->mb.nnz_ref[NNZ_LUMA - 8 + 3] = 0x80;

        t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 0] =
        t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 1] =
        t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 0] =
        t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 1] = 0x80;
    }

    if( mb_x > 0 )
    {
        int32_t left_xy  = t->mb.mb_xy - 1;

        /* load intra4x4: right column of the macroblock to the left */
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 0] = t->rec->mb[left_xy].mode_i4x4[5];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 8] = t->rec->mb[left_xy].mode_i4x4[7];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 16] = t->rec->mb[left_xy].mode_i4x4[13];
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 24] = t->rec->mb[left_xy].mode_i4x4[15];

        t->mb.vec_ref[IPM_LUMA - 1 + 0].vec = t->rec->mb[left_xy].vec[0][3];
        t->mb.vec_ref[IPM_LUMA - 1 + 8].vec = t->rec->mb[left_xy].vec[0][7];
        t->mb.vec_ref[IPM_LUMA - 1 + 16].vec = t->rec->mb[left_xy].vec[0][11];
        t->mb.vec_ref[IPM_LUMA - 1 + 24].vec = t->rec->mb[left_xy].vec[0][15];

        t->mb.vec_ref[IPM_LUMA - 1 + 0].part =
        t->mb.vec_ref[IPM_LUMA - 1 + 8].part =
        t->mb.vec_ref[IPM_LUMA - 1 + 16].part =
        t->mb.vec_ref[IPM_LUMA - 1 + 24].part = t->rec->mb[left_xy].mb_part;

        /* The left neighbour's sub-partitions go in the same left-column
           cells (IPM_LUMA - 1 + 8 * row) as its vectors and partitions.
           Indexing them with IPM_LUMA - 8 + 8 * row instead overwrites a
           top-row entry and three cells inside the current macroblock, and
           leaves the left column's subpart values unset. */
        t->mb.vec_ref[IPM_LUMA - 1 + 0].subpart = t->rec->mb[left_xy].submb_part[3];
        t->mb.vec_ref[IPM_LUMA - 1 + 8].subpart = t->rec->mb[left_xy].submb_part[7];
        t->mb.vec_ref[IPM_LUMA - 1 + 16].subpart = t->rec->mb[left_xy].submb_part[11];
        t->mb.vec_ref[IPM_LUMA - 1 + 24].subpart = t->rec->mb[left_xy].submb_part[15];

        t->mb.sad_ref[0] = t->rec->mb[left_xy].sad;

        /* load non_zero_count */
        t->mb.nnz_ref[NNZ_LUMA - 1 + 0] = t->rec->mb[left_xy].nnz[3];
        t->mb.nnz_ref[NNZ_LUMA - 1 + 8] = t->rec->mb[left_xy].nnz[7];
        t->mb.nnz_ref[NNZ_LUMA - 1 + 16] = t->rec->mb[left_xy].nnz[11];
        t->mb.nnz_ref[NNZ_LUMA - 1 + 24] = t->rec->mb[left_xy].nnz[15];

        t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 0] = t->rec->mb[left_xy].nnz[17];
        t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 8] = t->rec->mb[left_xy].nnz[19];
        t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 0] = t->rec->mb[left_xy].nnz[21];
        t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 8] = t->rec->mb[left_xy].nnz[23];
    }
    else
    {
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 0]  =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 8]  =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 16] =
        t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 24] = -1;

        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 0].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 8].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 16].vec);
        INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 24].vec);

        t->mb.vec_ref[IPM_LUMA - 1 + 0].part  =
        t->mb.vec_ref[IPM_LUMA - 1 + 8].part  =
        t->mb.vec_ref[IPM_LUMA - 1 + 16].part =
        t->mb.vec_ref[IPM_LUMA - 1 + 24].part = -1;

        t->mb.vec_ref[IPM_LUMA - 1 + 0].subpart  =
        t->mb.vec_ref[IPM_LUMA - 1 + 8].subpart  =
        t->mb.vec_ref[IPM_LUMA - 1 + 16].subpart =
        t->mb.vec_ref[IPM_LUMA - 1 + 24].subpart = -1;

        t->mb.nnz_ref[NNZ_LUMA - 1 + 0]  =
        t->mb.nnz_ref[NNZ_LUMA - 1 + 8]  =
        t->mb.nnz_ref[NNZ_LUMA - 1 + 16] =
        t->mb.nnz_ref[NNZ_LUMA - 1 + 24] = 0x80;

        t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 0] =
        t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 8] =
        t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 0] =
        t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 8] = 0x80;
    }
    if (mb_x > 0 && mb_y > 0)
    {
        /* bottom-right block of the macroblock above and to the left */
        int32_t lefttop_xy = t->mb.mb_xy - t->mb_stride - 1;
        t->mb.vec_ref[0].vec = t->rec->mb[lefttop_xy].vec[0][15];
        t->mb.vec_ref[0].subpart = t->rec->mb[lefttop_xy].submb_part[15];
        t->mb.vec_ref[0].part = t->rec->mb[lefttop_xy].mb_part;
    }
#undef INITINVALIDVEC
}
 
/*
 * Store the current macroblock's result: the first sizeof(*t->mb.context)
 * bytes of t->mb are copied into the slot t->mb.context points at (set by
 * T264_mb_load_context to &t->rec->mb[mb_xy]).
 */
void
T264_mb_save_context(T264_t* t)
{
    const size_t bytes = sizeof(*t->mb.context);

    memcpy(t->mb.context, &t->mb, bytes);
}
 
/*
 * Forget all reference frames: slot 0 becomes the reconstruction target with
 * frame number 0, every other slot below MAX_REFFRAMES is marked unused
 * (frame_num == -1).
 */
static void
T264_reset_ref(T264_t* t)
{
    int32_t slot = MAX_REFFRAMES;

    t->refn[0].frame_num = 0;
    t->rec = &t->refn[0];

    while (-- slot >= 1)
        t->refn[slot].frame_num = -1;
}
 
/*
 * Rebuild reference list 0 from slots 1..ref_num of t->refn, keeping only
 * slots holding a valid frame (frame_num >= 0) in slot order.  Only P frames
 * are handled, so no reordering is done.
 */
static void
T264_load_ref(T264_t* t)
{
    const int32_t last = t->param.ref_num;
    int32_t idx;

    t->refl0_num = 0;
    for (idx = 1 ; idx <= last ; idx ++)
    {
        if (t->refn[idx].frame_num < 0)
            continue;

        t->refl0[t->refl0_num] = &t->refn[idx];
        t->refl0_num ++;
    }
}
 
/* Copy `width` samples from src into dst, then repeat src's first sample
   `pad` times before dst and its last sample `pad` times after dst. */
static void
T264_border_copy_row(uint8_t* dst, uint8_t* src, int32_t width, int32_t pad)
{
    memcpy(dst, src, width);
    memset(dst - pad, src[0], pad);
    memset(dst + width, src[width - 1], pad);
}

/* Repeat a picture row's own first and last samples `pad` times to its
   left and right. */
static void
T264_border_fill_sides(uint8_t* row, int32_t width, int32_t pad)
{
    memset(row - pad, row[0], pad);
    memset(row + width, row[width - 1], pad);
}

/*
 * Pad frame f by replicating its edge samples into the border around the
 * picture: EDGED_WIDTH columns / EDGED_HEIGHT rows for luma and half of
 * that for each chroma plane.  Corners take the value of the nearest
 * picture corner sample.  Only Y[0], U and V are padded.
 */
static void
T264_extend_border(T264_t* t, T264_frame_t* f)
{
    const int32_t pad_y   = EDGED_WIDTH;
    const int32_t pad_uv  = EDGED_WIDTH >> 1;
    const int32_t vpad_uv = EDGED_HEIGHT >> 1;     /* luma gets twice as many rows */
    const int32_t rows_uv = t->height >> 1;        /* luma gets twice as many rows */
    uint8_t* last_y = f->Y[0] + t->edged_stride * (t->height - 1);
    uint8_t* last_u = f->U + t->edged_stride_uv * (rows_uv - 1);
    uint8_t* last_v = f->V + t->edged_stride_uv * (rows_uv - 1);
    int32_t r;

    // TODO: we need extend the interpolate pics

    /* rows above the picture (including both upper corners) */
    for (r = 1 ; r <= 2 * vpad_uv ; r ++)
    {
        T264_border_copy_row(f->Y[0] - r * t->edged_stride, f->Y[0], t->stride, pad_y);
    }
    for (r = 1 ; r <= vpad_uv ; r ++)
    {
        T264_border_copy_row(f->U - r * t->edged_stride_uv, f->U, t->stride_uv, pad_uv);
        T264_border_copy_row(f->V - r * t->edged_stride_uv, f->V, t->stride_uv, pad_uv);
    }

    /* columns left and right of every picture row */
    for (r = 0 ; r < 2 * rows_uv ; r ++)
    {
        T264_border_fill_sides(f->Y[0] + r * t->edged_stride, t->stride, pad_y);
    }
    for (r = 0 ; r < rows_uv ; r ++)
    {
        T264_border_fill_sides(f->U + r * t->edged_stride_uv, t->stride_uv, pad_uv);
        T264_border_fill_sides(f->V + r * t->edged_stride_uv, t->stride_uv, pad_uv);
    }

    /* rows below the picture (including both lower corners) */
    for (r = 0 ; r < 2 * vpad_uv ; r ++)
    {
        T264_border_copy_row(f->Y[0] + (t->height + r) * t->edged_stride, last_y, t->stride, pad_y);
    }
    for (r = 0 ; r < vpad_uv ; r ++)
    {
        T264_border_copy_row(f->U + (rows_uv + r) * t->edged_stride_uv, last_u, t->stride_uv, pad_uv);
        T264_border_copy_row(f->V + (rows_uv + r) * t->edged_stride_uv, last_v, t->stride_uv, pad_uv);
    }
}
 
/*
 * Build the three half-pel luma planes of frame f (Y[1] horizontal,
 * Y[2] vertical, Y[3] both) from the full-pel plane Y[0].  Does nothing
 * unless half- or quarter-pel search is enabled in t->flags.  The filtered
 * area starts 32 samples up and left of the picture origin and is shrunk
 * from the edged size by (EDGED_WIDTH - 32) / (EDGED_HEIGHT - 32) per side.
 */
static void
T264_interpolate_halfpel(T264_t* t, T264_frame_t* f)
{
    int32_t stride;
    int32_t origin;
    int32_t w, h;
    uint8_t* full;

    if ((t->flags & (USE_HALFPEL| USE_QUARTPEL)) == 0)
        return;

    stride = t->edged_stride;
    origin = - 32 * stride - 32;
    w      = t->edged_width - (EDGED_WIDTH - 32) * 2;
    h      = t->edged_height - (EDGED_HEIGHT - 32) * 2;
    full   = f->Y[0] + origin;

    t->interpolate_halfpel_h(full, stride, f->Y[1] + origin, stride, w, h);
    t->interpolate_halfpel_v(full, stride, f->Y[2] + origin, stride, w, h);
    t->interpolate_halfpel_hv(full, stride, f->Y[3] + origin, stride, w, h);
}
 
/*
 * Finish the just-coded picture and push it onto the reference list:
 * deblock it (unless disabled), pad its borders, build the half-pel planes,
 * then shift refn[0..ref_num-1] up by one slot.  The frame that falls off
 * the end (old refn[ref_num]) is recycled as the new refn[0], which becomes
 * the next reconstruction target.
 */
static void
T264_save_ref(T264_t* t)
{
    const int32_t oldest = t->param.ref_num;
    T264_frame_t recycled;
    int32_t slot;

    /* deblock filter exec here */
    if (t->param.disable_filter == 0)
    {
        T264_deblock_frame(t, t->rec);
    }

    /* current only deal with i,p */
    T264_extend_border(t, t->rec);
    T264_interpolate_halfpel(t, t->rec);

    recycled = t->refn[oldest];
    slot = oldest;
    while (slot >= 1)
    {
        t->refn[slot] = t->refn[slot - 1];
        slot --;
    }
    t->refn[0] = recycled;

    t->rec = &t->refn[0];
}
 
/*
 * Pick the luma coding mode for the current macroblock: intra decision for
 * I slices, inter decision for P slices, nothing for any other slice type.
 */
void
T264_mb_mode_decision(T264_t* t)
{
    if (t->slice_type == SLICE_I)
    {
        T264_mode_decision_intra_y(t);
        return;
    }

    if (t->slice_type == SLICE_P)
    {
        T264_mode_decision_inter_y(t);
    }
}
 
/*
 * Encode the current macroblock according to the mode already chosen in
 * t->mb.mb_mode and bump the matching statistics counter:
 *   I_4x4 / I_16x16  intra luma, then chroma mode decision and chroma
 *   P_L0             inter luma and chroma
 *   P_SKIP           nothing is coded, only the skip counter is updated
 */
void
T264_mb_encode(T264_t* t)
{
    const int32_t mode = t->mb.mb_mode;

    if (mode == I_4x4 || mode == I_16x16)
    {
        T264_encode_intra_y(t);

        /* chroma */
        T264_mode_decision_intra_uv(t);
        T264_encode_intra_uv(t);

        t->stat.i_block_num[t->mb.mb_mode] ++;
        return;
    }

    if (mode == P_L0)
    {
        T264_encode_inter_y(t);
        T264_encode_inter_uv(t);

        t->stat.p_block_num[t->mb.mb_part] ++;
        return;
    }

    if (mode == P_SKIP)
    {
        t->stat.skip_block_num++;
    }
}
 
/*
 * Plain-C stand-in for the emms hook: does nothing.  T264_init_cpu installs
 * it as t->emms and replaces it with T264_emms_mmx when MMX is enabled.
 * Declared with (void) so the definition carries a real prototype.
 */
void
T264_emms_c(void)
{
}
 
/*
 * Fill in the function-pointer tables of the encoder instance.
 *
 * Every hook is first bound to its plain C implementation; afterwards the
 * MMX, SSE and SSE2 blocks overwrite individual hooks when the matching bit
 * is set in t->param.cpu.  The order of the statements therefore matters:
 * later assignments win.
 */
static void
T264_init_cpu(T264_t* t)
{
    /* unless the caller forces a CPU mask, use the detected capabilities */
    if ((t->param.cpu & T264_CPU_FORCE) != T264_CPU_FORCE)
    {
        t->param.cpu = T264_detect_cpu();
    }

    /* intra 16x16 luma predictors */
    t->pred16x16[Intra_16x16_TOP]    = T264_predict_16x16_mode_0_c;
    t->pred16x16[Intra_16x16_LEFT]   = T264_predict_16x16_mode_1_c;
    t->pred16x16[Intra_16x16_DC]     = T264_predict_16x16_mode_2_c;
    t->pred16x16[Intra_16x16_PLANE]  = T264_predict_16x16_mode_3_c;
    t->pred16x16[Intra_16x16_DCTOP]  = T264_predict_16x16_mode_20_c;
    t->pred16x16[Intra_16x16_DCLEFT] = T264_predict_16x16_mode_21_c;
    t->pred16x16[Intra_16x16_DC128]  = T264_predict_16x16_mode_22_c;

    /* intra 8x8 predictors */
    t->pred8x8[Intra_8x8_TOP]    = T264_predict_8x8_mode_0_c;
    t->pred8x8[Intra_8x8_LEFT]   = T264_predict_8x8_mode_1_c;
    t->pred8x8[Intra_8x8_DC]     = T264_predict_8x8_mode_2_c;
    t->pred8x8[Intra_8x8_PLANE]  = T264_predict_8x8_mode_3_c;
    t->pred8x8[Intra_8x8_DCTOP]  = T264_predict_8x8_mode_20_c;
    t->pred8x8[Intra_8x8_DCLEFT] = T264_predict_8x8_mode_21_c;
    t->pred8x8[Intra_8x8_DC128]  = T264_predict_8x8_mode_22_c;

    /* intra 4x4 predictors */
    t->pred4x4[Intra_4x4_TOP]    = T264_predict_4x4_mode_0_c;
    t->pred4x4[Intra_4x4_LEFT]   = T264_predict_4x4_mode_1_c;
    t->pred4x4[Intra_4x4_DC]     = T264_predict_4x4_mode_2_c;
    t->pred4x4[Intra_4x4_DCTOP]  = T264_predict_4x4_mode_20_c;
    t->pred4x4[Intra_4x4_DCLEFT] = T264_predict_4x4_mode_21_c;
    t->pred4x4[Intra_4x4_DC128]  = T264_predict_4x4_mode_22_c;

    /* directional 4x4 predictors (added by "cloud") */
    t->pred4x4[Intra_4x4_DIAGONAL_DOWNLEFT]  = T264_predict_4x4_mode_3_c;
    t->pred4x4[Intra_4x4_DIAGONAL_DOWNRIGHT]  = T264_predict_4x4_mode_4_c;
    t->pred4x4[Intra_4x4_VERTICAL_RIGHT]  = T264_predict_4x4_mode_5_c;
    t->pred4x4[Intra_4x4_HORIZONTAL_DOWN]  = T264_predict_4x4_mode_6_c;
    t->pred4x4[Intra_4x4_VERTICAL_LEFT]  = T264_predict_4x4_mode_7_c;
    t->pred4x4[Intra_4x4_HORIZONTAL_UP]  = T264_predict_4x4_mode_8_c;

    /* block comparison metric: SAD when USE_SAD is set, SATD otherwise */
    if (t->flags & USE_SAD)
    {
        t->cmp[MB_16x16] = T264_sad_u_16x16_c;
        t->cmp[MB_16x8]  = T264_sad_u_16x8_c;
        t->cmp[MB_8x16]  = T264_sad_u_8x16_c;
        t->cmp[MB_8x8]   = T264_sad_u_8x8_c;
        t->cmp[MB_8x4]   = T264_sad_u_8x4_c;
        t->cmp[MB_4x8]   = T264_sad_u_4x8_c;
        t->cmp[MB_4x4]   = T264_sad_u_4x4_c;
    }
    else
    {
        t->cmp[MB_16x16] = T264_satd_u_16x16_c;
        t->cmp[MB_16x8]  = T264_satd_u_16x8_c;
        t->cmp[MB_8x16]  = T264_satd_u_8x16_c;
        t->cmp[MB_8x8]   = T264_satd_u_8x8_c;
        t->cmp[MB_8x4]   = T264_satd_u_8x4_c;
        t->cmp[MB_4x8]   = T264_satd_u_4x8_c;
        t->cmp[MB_4x4]   = T264_satd_u_4x4_c;
    }

    /* the sad[] table is always SAD, independent of USE_SAD */
    t->sad[MB_16x16] = T264_sad_u_16x16_c;
    t->sad[MB_16x8]  = T264_sad_u_16x8_c;
    t->sad[MB_8x16]  = T264_sad_u_8x16_c;
    t->sad[MB_8x8]   = T264_sad_u_8x8_c;
    t->sad[MB_8x4]   = T264_sad_u_8x4_c;
    t->sad[MB_4x8]   = T264_sad_u_4x8_c;
    t->sad[MB_4x4]   = T264_sad_u_4x4_c;

    /* forward / inverse transforms */
    t->fdct4x4   = dct4x4_c;
    t->fdct4x4dc = dct4x4dc_c;
    t->fdct2x2dc = dct2x2dc_c;
    t->idct4x4   = idct4x4_c;
    t->idct4x4dc = idct4x4dc_c;
    t->idct2x2dc = idct2x2dc_c;

    /* quantisation / dequantisation */
    t->quant4x4    = quant4x4_c;
    t->quant4x4dc  = quant4x4dc_c;
    t->quant2x2dc  = quant2x2dc_c;
    t->iquant4x4   = iquant4x4_c;
    t->iquant4x4dc = iquant4x4dc_c;
    t->iquant2x2dc = iquant2x2dc_c;

    /* sample copy / conversion helpers and motion compensation */
    t->expand8to16   = expand8to16_c;
    t->contract16to8 = contract16to8_c;
    t->contract16to8add = contract16to8add_c;
    t->expand8to16sub   = expand8to16sub_c;
    t->memcpy_stride_u = memcpy_stride_u_c;
    t->eighth_pixel_mc_u = T264_eighth_pixel_mc_u_c;

    t->interpolate_halfpel_h = interpolate_halfpel_h_c;
    t->interpolate_halfpel_v = interpolate_halfpel_v_c;
    t->interpolate_halfpel_hv = interpolate_halfpel_hv_c;
    t->pixel_avg = T264_pixel_avg_c;
    t->T264_satd_16x16_u = T264_satd_i16x16_u_c;
    t->emms = T264_emms_c;

    // flags relative
    /* motion search: USE_FULLSEARCH takes precedence over USE_DIAMONDSEACH;
       with neither flag set T264_search_full is used */
    if (t->flags & USE_FULLSEARCH)
        // xxx
        t->search = T264_spiral_search_full;
    else if (t->flags & USE_DIAMONDSEACH)
        t->search = T264_search;
    else
        t->search = T264_search_full;

    /* MMX: emms plus the 4x4 and 4x4-DC transforms (the 2x2 DC ones stay C) */
    if (t->param.cpu & T264_CPU_MMX)
    {
        t->emms = T264_emms_mmx;
        t->fdct4x4 = dct4x4_mmx;
        t->fdct4x4dc = dct4x4dc_mmx;
        t->idct4x4 = idct4x4_mmx;
        t->idct4x4dc = idct4x4dc_mmx;
    }
    /* SSE: SAD for partitions at most 8 samples wide; cmp[] is only
       replaced when it is SAD-based, so SATD comparison is kept as is */
    if (t->param.cpu & T264_CPU_SSE)
    {
        if (t->flags & USE_SAD)
        {
            t->cmp[MB_8x16]  = T264_sad_u_8x16_sse;
            t->cmp[MB_8x8]   = T264_sad_u_8x8_sse;
            t->cmp[MB_8x4]   = T264_sad_u_8x4_sse;
            t->cmp[MB_4x8]   = T264_sad_u_4x8_sse;
            t->cmp[MB_4x4]   = T264_sad_u_4x4_sse;
        }

        t->sad[MB_8x16]  = T264_sad_u_8x16_sse;
        t->sad[MB_8x8]   = T264_sad_u_8x8_sse;
        t->sad[MB_8x4]   = T264_sad_u_8x4_sse;
        t->sad[MB_4x8]   = T264_sad_u_4x8_sse;
        t->sad[MB_4x4]   = T264_sad_u_4x4_sse;
    }
    /* SSE2: 4x4 (de)quantisation, SAD for the 16-wide partitions and the
       horizontal / vertical half-pel filters (the hv filter stays C) */
    if (t->param.cpu & T264_CPU_SSE2)
    {
        t->quant4x4 = quant4x4_sse2;
        t->iquant4x4 = iquant4x4_sse2;
        if (t->flags & USE_SAD)
        {
            t->cmp[MB_16x16] = T264_sad_u_16x16_sse2;
            t->cmp[MB_16x8]  = T264_sad_u_16x8_sse2;
        }

        t->sad[MB_16x16] = T264_sad_u_16x16_sse2;
        t->sad[MB_16x8]  = T264_sad_u_16x8_sse2;
        t->interpolate_halfpel_h = interpolate_halfpel_h_sse2;
        t->interpolate_halfpel_v = interpolate_halfpel_v_sse2;
    }
}
 
/*
 * Point frame f at a caller-owned picture stored at src as three
 * consecutive planes: width*height luma bytes, then two chroma planes of a
 * quarter of that size each.  No data is copied.
 */
void
T264_init_frame(T264_t* t, uint8_t* src, T264_frame_t* f, int32_t frame_num)
{
    const int32_t luma_bytes = t->width * t->height;
    uint8_t* const u_plane = src + luma_bytes;

    f->frame_num = frame_num;
    f->Y[0] = src;
    f->U = u_plane;
    f->V = u_plane + (luma_bytes >> 2);
}
 
/*
 * Post-process the macroblock that was just encoded:
 *   - count the non-zero coefficients of every luma and chroma 4x4 block,
 *     store them in t->mb.nnz / t->mb.nnz_ref and derive cbp_y and cbp_c;
 *   - turn a P 16x16 macroblock with no coefficients into P_SKIP when its
 *     vector equals the predicted skip vector;
 *   - publish the intra 4x4 modes (or reset them to DC for other modes);
 *   - publish the motion vectors and partition info for inter macroblocks,
 *     or invalidate them (refno -1, part/subpart -1) for intra ones.
 */
void
T264_mb_encode_post(T264_t* t)
{
    int32_t i, j;
    const int32_t is_i16x16 = (t->mb.mb_mode == I_16x16);

    /* Luma.  For I_16x16 the first coefficient of each block is skipped
       (15 counted from index 1) and any non-zero block sets all four cbp
       bits; otherwise every 8x8 quadrant (i / 4) gets its own bit. */
    t->mb.cbp_y = 0;
    for(i = 0; i < 16 ; i ++)
    {
        const int32_t x = luma_inverse_x[i];
        const int32_t y = luma_inverse_y[i];
        const int32_t nz = is_i16x16
            ? array_non_zero_count(&(t->mb.dct_y_z[i][1]), 15)
            : array_non_zero_count(t->mb.dct_y_z[i], 16);

        t->mb.nnz[luma_index[i]] = nz;
        t->mb.nnz_ref[NNZ_LUMA + y * 8 + x] = nz;
        if( nz > 0 )
        {
            t->mb.cbp_y |= is_i16x16 ? 0x0f : (1 << (i / 4));
        }
    }

    /* Calculate the chroma pattern: blocks 0..3 belong to the first chroma
       plane, 4..7 to the second; only the 15 coefficients after the first
       are counted here. */
    t->mb.cbp_c = 0;
    for(i = 0; i < 8; i ++)
    {
        const int32_t plane = i / 4;
        const int32_t blk   = i % 4;
        const int32_t base  = (plane == 0) ? NNZ_CHROMA0 : NNZ_CHROMA1;
        const int nz = array_non_zero_count(&(t->mb.dct_uv_z[plane][blk][1]), 15);

        t->mb.nnz[i + 16] = nz;
        t->mb.nnz_ref[base + (blk / 2) * 8 + (blk % 2)] = nz;
        if( nz > 0 )
        {
            t->mb.cbp_c = 0x02;    /* dc+ac */
        }
    }
    if(t->mb.cbp_c == 0x00 &&
       (array_non_zero_count(t->mb.dc2x2_z[0], 4) > 0 || array_non_zero_count(t->mb.dc2x2_z[1], 4) > 0))
    {
        t->mb.cbp_c = 0x01;    /* dc only */
    }

    // really decide SKIP mode
    if(t->slice_type == SLICE_P && t->mb.mb_part == MB_16x16)
    {
        if (t->mb.cbp_y == 0 && t->mb.cbp_c == 0)
        {
            T264_vector_t vec;
            T264_predict_mv_skip(t, 0, &vec);
            if (vec.x == t->mb.vec[0][0].x &&
                vec.y == t->mb.vec[0][0].y)
            {
                t->mb.mb_part = MB_16x16;
                t->mb.mb_mode = P_SKIP;
            }
        }
    }

    if (t->mb.mb_mode == I_4x4)
    {
        /* mirror the chosen 4x4 modes into the neighbour cache */
        int8_t* p = t->mb.i4x4_pred_mode_ref;
        for(i = 0; i < 16 ; i ++)
        {
            const int32_t x = luma_inverse_x[i];
            const int32_t y = luma_inverse_y[i];
            p[IPM_LUMA + y * 8 + x] = t->mb.mode_i4x4[i];
        }
    }
    else
    {
        memset(t->mb.mode_i4x4, Intra_4x4_DC, 16 * sizeof(uint8_t));
    }

    if (t->mb.mb_mode != I_4x4 && t->mb.mb_mode != I_16x16)
    {
        /* inter: copy the 16 block vectors into the neighbour cache */
        for(i = 0 ; i < 16 ; i ++)
        {
            const int32_t x = i % 4;
            const int32_t y = i / 4;
            t->mb.vec_ref[VEC_LUMA + y * 8 + x].vec     = t->mb.vec[0][i];
            t->mb.vec_ref[VEC_LUMA + y * 8 + x].part    = t->mb.mb_part;
            t->mb.vec_ref[VEC_LUMA + y * 8 + x].subpart = t->mb.submb_part[i];
        }
    }
    else
    {
        /* intra: no motion information for this macroblock */
        memset(t->mb.submb_part, -1, sizeof(t->mb.submb_part));
        t->mb.mb_part = -1;
        for(i = 0 ; i < 2 ; i ++)
        {
            for(j = 0 ; j < 16 ; j ++)
            {
                t->mb.vec[i][j].refno = -1;
                t->mb.vec[i][j].x = t->mb.vec[i][j].y = 0;
            }
        }
    }
}
 
/*
 * Copy the NAL units held back to back in src into dst, inserting an
 * emulation-prevention byte (0x03) after every pair of zero bytes that is
 * followed by a byte <= 3.
 *
 *   src       NAL units, each starting with its 4-byte start code
 *   nal_pos   nal_num offsets; unit i has length nal_pos[i+1] - nal_pos[i]
 *   nal_num   number of entries in nal_pos (one more than the unit count)
 *   dst       output buffer
 *   dst_size  capacity of dst in bytes
 *
 * The first (up to) four bytes of a unit are copied verbatim, and its last
 * byte is always copied unescaped.  Returns the number of bytes written, or
 * 0 when the escaped stream does not fit into dst_size bytes; in that case
 * dst may hold a partial stream, but nothing is written past dst_size.
 */
static uint32_t
write_dst(uint8_t* src, int32_t nal_pos[4], int32_t nal_num, uint8_t* dst, int32_t dst_size)
{
    int32_t i, j;
    int32_t n = 0;

    for(i = 0 ; i < nal_num - 1; i ++)
    {
        const int32_t nal_len = nal_pos[i + 1] - nal_pos[i];
        const int32_t head = nal_len < 4 ? nal_len : 4;
        int32_t zeros = 0;

        if (nal_len <= 0)
            continue;

        // start code 00 00 00 01
        if (n + head > dst_size)
            return 0;
        for(j = 0 ; j < head ; j ++)
            dst[n ++] = src[j];

        for(j = 4 ; j < nal_len - 1; j ++)
        {
            if (src[j] == 0)
            {
                zeros ++;
                if (zeros >= 2 && src[j + 1] <= 3)
                {
                    /* second zero of the pair, then the escape byte */
                    if (n + 2 > dst_size)
                        return 0;
                    dst[n ++] = 0;
                    dst[n ++] = 3;
                    zeros = 0;
                    continue;
                }
            }
            else
            {
                zeros = 0;
            }
            if (n + 1 > dst_size)
                return 0;
            dst[n ++] = src[j];
        }

        /* final byte of the unit; skipped for a unit with no payload so
           that nothing beyond the unit is read */
        if (nal_len > 4)
        {
            if (n + 1 > dst_size)
                return 0;
            dst[n ++] = src[nal_len - 1];
        }
        src += nal_len;
    }

    return (uint32_t)n;
}
 
/////////////////////////////////////////////////////////// 
// interface 
/*
 * Create an encoder instance from *para.
 *
 * The picture size is rounded down to whole macroblocks.  ref_num + 1
 * frames are allocated, each with an edged full-pel picture, three half-pel
 * luma planes and a per-macroblock context array, plus one bitstream
 * scratch buffer of width * height * 2 bytes.
 *
 * Side effect on the caller's struct: USE_FORCEBLOCKSIZE implies
 * USE_SUBBLOCK and USE_QUARTPEL implies USE_HALFPEL; both are OR-ed into
 * para->flags.
 *
 * Returns the new instance, or NULL when para is NULL, the picture is
 * smaller than one macroblock, ref_num is outside 0..MAX_REFFRAMES-1, or an
 * allocation fails (everything allocated so far is released).
 * Release the instance with T264_close().
 */
T264_t*
T264_open(T264_param_t* para)
{
    T264_t* t;
    int32_t i;
    int32_t frame_offset;

    if (para == NULL)
        return NULL;
    if (para->width < 16 || para->height < 16)
        return NULL;
    /* T264_reset_ref() only touches refn[0 .. MAX_REFFRAMES - 1], so that
       is the range known to exist; ref_num + 1 slots are used below.
       NOTE(review): confirm against the declared size of T264_t::refn. */
    if (para->ref_num < 0 || para->ref_num >= MAX_REFFRAMES)
        return NULL;

    if (para->flags & USE_FORCEBLOCKSIZE)
        para->flags |= USE_SUBBLOCK;
    if (para->flags & USE_QUARTPEL)
        para->flags |= USE_HALFPEL;

    t = T264_malloc(sizeof(T264_t), CACHE_SIZE);
    if (t == NULL)
        return NULL;
    /* also leaves every buffer pointer NULL for the failure path below */
    memset(t, 0, sizeof(T264_t));

    t->mb_width  = para->width >> 4;
    t->mb_height = para->height >> 4;
    t->mb_stride = t->mb_width;
    t->width  = t->mb_width << 4;
    t->height = t->mb_height << 4;
    t->edged_width = t->width + 2 * EDGED_WIDTH;
    t->edged_height = t->height + 2 * EDGED_HEIGHT;
    t->qp_y   = para->qp;
    t->flags  = para->flags;

    t->stride    = t->width;
    t->stride_uv = t->width >> 1;
    t->edged_stride = t->edged_width;
    t->edged_stride_uv = t->edged_width >> 1;

    /* offset of the picture origin inside an edged luma plane */
    frame_offset = EDGED_HEIGHT * t->edged_width + EDGED_WIDTH;

    t->bs_buf = T264_malloc(t->width * t->height << 1, CACHE_SIZE);
    if (t->bs_buf == NULL)
        goto fail;

    for(i = 0 ; i < para->ref_num + 1 ; i ++)
    {
        /* full-pel picture: edged luma plane followed by two edged chroma planes */
        uint8_t* p = T264_malloc(t->edged_width * t->edged_height + (t->edged_width * t->edged_height >> 1), CACHE_SIZE);
        if (p == NULL)
            goto fail;
        t->refn[i].Y[0] = p + frame_offset;
        t->refn[i].U = p + t->edged_width * t->edged_height + (t->edged_width * EDGED_HEIGHT >> 2) + (EDGED_WIDTH >> 1);
        t->refn[i].V = p + t->edged_width * t->edged_height + (t->edged_width * t->edged_height >> 2) + (t->edged_width * EDGED_HEIGHT >> 2) + (EDGED_WIDTH >> 1);

        t->refn[i].mb = T264_malloc(t->mb_height * t->mb_width * sizeof(T264_mb_context_t), CACHE_SIZE);
        if (t->refn[i].mb == NULL)
            goto fail;

        /* three half-pel luma planes in a single allocation */
        p = T264_malloc(t->edged_width * t->edged_height * 3, CACHE_SIZE);
        if (p == NULL)
            goto fail;
        t->refn[i].Y[1] = p + frame_offset;
        t->refn[i].Y[2] = t->refn[i].Y[1] + t->edged_width * t->edged_height;
        t->refn[i].Y[3] = t->refn[i].Y[2] + t->edged_width * t->edged_height;
    }

    t->param = *para;
    t->idr_pic_id = -1;
    t->frame_id = -1;

    T264_init_cpu(t);

    rc_init_seq(t);

    return t;

fail:
    /* release whatever was allocated; untouched slots are still NULL */
    for(i = 0 ; i < para->ref_num + 1 ; i ++)
    {
        if (t->refn[i].Y[0] != NULL)
            T264_free(t->refn[i].Y[0] - frame_offset);
        if (t->refn[i].mb != NULL)
            T264_free(t->refn[i].mb);
        if (t->refn[i].Y[1] != NULL)
            T264_free(t->refn[i].Y[1] - frame_offset);
    }
    if (t->bs_buf != NULL)
        T264_free(t->bs_buf);
    T264_free(t);

    return NULL;
}
 
/*
 * Destroy an encoder instance created by T264_open(): frees the picture,
 * half-pel and macroblock-context buffers of all ref_num + 1 frames, the
 * bitstream scratch buffer and the instance itself.  A NULL handle is
 * ignored, and frame buffers that were never allocated are skipped.
 */
void
T264_close(T264_t* t)
{
    int32_t i;
    int32_t frame_offset;

    if (t == NULL)
        return;

    /* the stored plane pointers point at the picture origin, this many
       bytes past the start of their allocation */
    frame_offset = EDGED_HEIGHT * t->edged_width + EDGED_WIDTH;

    for(i = 0 ; i < t->param.ref_num + 1 ; i ++)
    {
        if (t->refn[i].Y[0] != NULL)
            T264_free(t->refn[i].Y[0] - frame_offset);
        if (t->refn[i].mb != NULL)
            T264_free(t->refn[i].mb);
        if (t->refn[i].Y[1] != NULL)
            T264_free(t->refn[i].Y[1] - frame_offset);
    }

    if (t->bs_buf != NULL)
        T264_free(t->bs_buf);

    T264_free(t);
}
 
/*
 * Encode one picture.
 *
 *   t         encoder instance from T264_open()
 *   src       input picture: luma plane followed by the two chroma planes,
 *             laid out as T264_init_frame() expects
 *   dst       receives the NAL units (start codes and emulation-prevention
 *             bytes included)
 *   dst_size  capacity of dst in bytes
 *
 * The picture type follows t->frame_num: an IDR picture (preceded by
 * sequence and picture parameter sets) when it is a multiple of
 * param.idrframe, otherwise an I picture when it is a multiple of
 * param.iframe, otherwise a P picture.  Macroblocks are coded in raster
 * order, then the reconstructed picture is stored as a reference and the
 * rate-control state is updated.
 *
 * Returns the number of bytes written to dst; 0 when an argument is
 * invalid.
 */
int32_t
T264_encode(T264_t* t, uint8_t* src, uint8_t* dst, int32_t dst_size)
{
    int32_t i, j;
    int32_t nal_pos[4];     // remember each nal start pos
    int32_t nal_num = 0;
    int32_t len;
    int32_t bs_size;

    if (t == NULL || src == NULL || dst == NULL || dst_size <= 0)
        return 0;

    /* T264_open() allocates bs_buf with width * height * 2 bytes, so never
       announce more than that to the bit writer.
       NOTE(review): assumes the third argument of eg_init() is the buffer
       capacity in bytes - confirm. */
    bs_size = t->width * t->height << 1;
    if (bs_size > dst_size)
        bs_size = dst_size;

    eg_init(&t->bs, t->bs_buf, bs_size);
    T264_init_frame(t, src, &t->cur, t->frame_num);

    T264_load_ref(t);
    t->frame_id ++;

    if (t->frame_num % t->param.idrframe == 0)
    {
        /* IDR: sequence parameter set, picture parameter set, IDR slice */
        nal_pos[nal_num ++] = eg_len(&t->bs);
        nal_unit_init(&t->nal, 1, NAL_SEQ_SET);
        nal_unit_write(t, &t->nal);

        seq_set_init(t, &t->ss);
        seq_set_write(t, &t->ss);

        nal_pos[nal_num ++] = eg_len(&t->bs);

        nal_unit_init(&t->nal, 1, NAL_PIC_SET);
        nal_unit_write(t, &t->nal);

        pic_set_init(t, &t->ps);
        pic_set_write(t, &t->ps);

        nal_pos[nal_num ++] = eg_len(&t->bs);

        nal_unit_init(&t->nal, 1, NAL_SLICE_IDR);
        nal_unit_write(t, &t->nal);

        t->slice_type        = SLICE_I;
        t->idr_pic_id = (t->idr_pic_id + 1) % 65535;
        t->frame_num = 0;
        t->poc = 0;

        T264_reset_ref(t);

        rc_init_gop(t);
    }
    else if (t->frame_num % t->param.iframe == 0)
    {
        nal_pos[nal_num ++] = eg_len(&t->bs);
        nal_unit_init(&t->nal, 1, NAL_SLICE_NOPART);
        nal_unit_write(t, &t->nal);

        t->slice_type        = SLICE_I;

        rc_init_gop(t);
    }
    else    // P or B pic
    {
        nal_pos[nal_num ++] = eg_len(&t->bs);
        nal_unit_init(&t->nal, 1, NAL_SLICE_NOPART);
        nal_unit_write(t, &t->nal);

        t->slice_type        = SLICE_P;
    }

    rc_init_pic(t);
    rc_update_qp(t);

    slice_header_init(t, &t->slice);
    slice_header_write(t, &t->slice);

    t->rc.header_bits = eg_len(&t->bs) * 8;
    t->sad_all = 0;

    for(i = 0 ; i < t->mb_height ; i ++)
    {
        for(j = 0 ; j < t->mb_width ; j ++)
        {
            T264_mb_load_context(t, i, j);

            T264_mb_mode_decision(t);

            T264_mb_encode(t);

            T264_mb_encode_post(t);

            /* skipped macroblocks are only counted; the run length is
               written later */
            if(t->mb.mb_mode == P_SKIP)
            {
                t->skip ++;
            }
            else
            {
                T264_macroblock_write_cavlc(t);
            }

            T264_mb_save_context(t);
            t->sad_all += t->mb.sad;
        }
    }

    /* update current pic */
    t->poc = (t->poc + 2) % ((1 << (t->ss.max_pic_order + 4)) - 1);
    if (t->slice_type != SLICE_B)
    {
        T264_save_ref(t);
        t->frame_num = (t->frame_num + 1) % ((1 << (t->ss.log2_max_frame_num_minus4 + 4)) - 1);
    }
    /* a skip run still pending at the end of the picture */
    if (t->skip > 0)
    {
        eg_write_ue(&t->bs, t->skip);
        t->skip = 0;
    }

    eg_align(&t->bs);
    eg_flush(&t->bs);
    /* end position of the last NAL unit */
    nal_pos[nal_num ++] = eg_len(&t->bs);

    len = write_dst(t->bs_buf, nal_pos, nal_num, dst, dst_size);

    t->emms();

    t->rc.bits = len * 8;
    rc_update_quad_model(t);
    rc_update_pic(t);

    return len;
}