www.pudn.com > T264-src-0.02.zip > t264enc.c
/***************************************************************************** * * T264 AVC CODEC * * Copyright(C) 2004-2005 llcc* 2004-2005 visionany * * This program is free software ; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation ; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY ; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program ; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * ****************************************************************************/ #include "stdio.h" #include "memory.h" #include "t264.h" #include "utility.h" #include "intra.h" #include "cavlc.h" #include "inter.h" #include "interpolate.h" #include "estimation.h" #include "deblock.h" #include "ratecontrol.h" #include "sse2\sse2.h" #include "math.h" static const int32_t chroma_qp[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 29, 30, 31, 32, 32, 33, 34, 34, 35, 35, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 39, 39 }; //! convert from H.263 QP to H.26L quant given by: quant=pow(2,QP/6) static const int32_t qp_cost[52]= { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 6, 7, 8, 9,10,11,13,14, 16,18,20,23,25,29,32,36, 40,45,51,57,64,72,81,91 }; float psnr(uint8_t* p1, uint8_t* p2, int32_t size) { float sad = 0; int32_t i; for (i = 0 ; i < size ; i ++) { int32_t tmp; tmp = (p1[i] - p2[i]); sad += tmp * tmp; } return (float)(10 * log10(65025.0f * size / sad)); } void T264_mb_load_context(T264_t* t, int32_t mb_y, int32_t mb_x) { int32_t qpc; int32_t i, j; t->mb.mb_x = mb_x; t->mb.mb_y = mb_y; t->mb.mb_xy = t->mb.mb_y * t->mb_stride + t->mb.mb_x; t->mb.mb_neighbour = 0; if (mb_x != 0) t->mb.mb_neighbour |= MB_LEFT; if (mb_y != 0) { t->mb.mb_neighbour |= MB_TOP; if (mb_x != t->mb_stride - 1) t->mb.mb_neighbour |= MB_TOPRIGHT; } t->mb.src_y = t->cur.Y[0] + (mb_y << 4) * t->stride + (mb_x << 4); t->mb.dst_y = t->rec->Y[0] + (mb_y << 4) * t->edged_stride + (mb_x << 4); t->mb.src_u = t->cur.U + (mb_y << 3) * t->stride_uv + (mb_x << 3); t->mb.dst_u = t->rec->U + (mb_y << 3) * t->edged_stride_uv + (mb_x << 3); t->mb.src_v = t->cur.V + (mb_y << 3) * t->stride_uv + (mb_x << 3); t->mb.dst_v = t->rec->V + (mb_y << 3) * t->edged_stride_uv + (mb_x << 3); t->mb.mb_qp_delta = 0; /* t->ps.chroma_qp_index_offset maybe modify in ratecontrol */ qpc = clip3(t->ps.chroma_qp_index_offset + t->qp_y, 0, 51); t->qp_uv = chroma_qp[qpc]; t->mb.lambda = qp_cost[t->qp_y]; t->mb.context = &t->rec->mb[t->mb.mb_xy]; #define INITINVALIDVEC(vec) vec.refno = -2; vec.x = vec.y = 0; INITINVALIDVEC(t->mb.vec_ref[0].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].vec); t->mb.vec_ref[0].part = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4].part = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].part = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].part = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].part = -1; t->mb.vec_ref[0].subpart = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4].subpart = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 8].subpart = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 16].subpart = -1; t->mb.vec_ref[IPM_LUMA - 8 + 4 + 24].subpart = -1; memset(t->mb.submb_part, -1, sizeof(t->mb.submb_part)); t->mb.mb_part = -1; for(i = 0 ; i < 2 ; i ++) { for(j = 0 ; j < 16 ; j ++) { INITINVALIDVEC(t->mb.vec[i][j]); } } t->mb.sad_ref[0] = t->mb.sad_ref[1] = t->mb.sad_ref[2] = -1; //intra_4x4 prediction modes and non-zero counts if( mb_y > 0 ) { int16_t top_xy = t->mb.mb_xy - t->mb_stride; /* intra 4x4 pred mode layout ? x x x x x x x x */ t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 0] = t->rec->mb[top_xy].mode_i4x4[10]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 1] = t->rec->mb[top_xy].mode_i4x4[11]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 2] = t->rec->mb[top_xy].mode_i4x4[14]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 3] = t->rec->mb[top_xy].mode_i4x4[15]; t->mb.vec_ref[IPM_LUMA - 8 + 0].vec = t->rec->mb[top_xy].vec[0][12]; t->mb.vec_ref[IPM_LUMA - 8 + 1].vec = t->rec->mb[top_xy].vec[0][13]; t->mb.vec_ref[IPM_LUMA - 8 + 2].vec = t->rec->mb[top_xy].vec[0][14]; t->mb.vec_ref[IPM_LUMA - 8 + 3].vec = t->rec->mb[top_xy].vec[0][15]; t->mb.vec_ref[IPM_LUMA - 8 + 0].part = t->mb.vec_ref[IPM_LUMA - 8 + 1].part = t->mb.vec_ref[IPM_LUMA - 8 + 2].part = t->mb.vec_ref[IPM_LUMA - 8 + 3].part = t->rec->mb[top_xy].mb_part; t->mb.vec_ref[IPM_LUMA - 8 + 0].subpart = t->rec->mb[top_xy].submb_part[12]; t->mb.vec_ref[IPM_LUMA - 8 + 1].subpart = t->rec->mb[top_xy].submb_part[13]; t->mb.vec_ref[IPM_LUMA - 8 + 2].subpart = t->rec->mb[top_xy].submb_part[14]; t->mb.vec_ref[IPM_LUMA - 8 + 3].subpart = t->rec->mb[top_xy].submb_part[15]; t->mb.sad_ref[1] = t->rec->mb[top_xy].sad; if (mb_x != t->mb_stride - 1) { int32_t righttop_xy = top_xy + 1; t->mb.vec_ref[IPM_LUMA - 8 + 4].vec = t->rec->mb[righttop_xy].vec[0][12]; t->mb.vec_ref[IPM_LUMA - 8 + 4].part = t->rec->mb[righttop_xy].mb_part; t->mb.vec_ref[IPM_LUMA - 8 + 4].subpart = t->rec->mb[righttop_xy].submb_part[12]; t->mb.sad_ref[2] = t->rec->mb[righttop_xy].sad; } /* nnz layout: ? x x x x ? x x x x x x x ? x x x x x */ t->mb.nnz_ref[NNZ_LUMA - 8 + 0] = t->rec->mb[top_xy].nnz[12]; t->mb.nnz_ref[NNZ_LUMA - 8 + 1] = t->rec->mb[top_xy].nnz[13]; t->mb.nnz_ref[NNZ_LUMA - 8 + 2] = t->rec->mb[top_xy].nnz[14]; t->mb.nnz_ref[NNZ_LUMA - 8 + 3] = t->rec->mb[top_xy].nnz[15]; t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 0] = t->rec->mb[top_xy].nnz[18]; t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 1] = t->rec->mb[top_xy].nnz[19]; t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 0] = t->rec->mb[top_xy].nnz[22]; t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 1] = t->rec->mb[top_xy].nnz[23]; } else { /* load intra4x4 */ t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 0] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 1] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 2] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 8 + 3] = -1; INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 0].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 1].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 2].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 8 + 3].vec); t->mb.vec_ref[IPM_LUMA - 8 + 0].part = t->mb.vec_ref[IPM_LUMA - 8 + 1].part = t->mb.vec_ref[IPM_LUMA - 8 + 2].part = t->mb.vec_ref[IPM_LUMA - 8 + 3].part = -1; t->mb.vec_ref[IPM_LUMA - 8 + 0].subpart = t->mb.vec_ref[IPM_LUMA - 8 + 1].subpart = t->mb.vec_ref[IPM_LUMA - 8 + 2].subpart = t->mb.vec_ref[IPM_LUMA - 8 + 3].subpart = -1; t->mb.nnz_ref[NNZ_LUMA - 8 + 0] = t->mb.nnz_ref[NNZ_LUMA - 8 + 1] = t->mb.nnz_ref[NNZ_LUMA - 8 + 2] = t->mb.nnz_ref[NNZ_LUMA - 8 + 3] = 0x80; t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 0] = t->mb.nnz_ref[NNZ_CHROMA0 - 8 + 1] = t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 0] = t->mb.nnz_ref[NNZ_CHROMA1 - 8 + 1] = 0x80; } if( mb_x > 0 ) { int16_t left_xy = t->mb.mb_xy - 1; /* load intra4x4 */ t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 0] = t->rec->mb[left_xy].mode_i4x4[5]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 8] = t->rec->mb[left_xy].mode_i4x4[7]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 16] = t->rec->mb[left_xy].mode_i4x4[13]; t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 24] = t->rec->mb[left_xy].mode_i4x4[15]; t->mb.vec_ref[IPM_LUMA - 1 + 0].vec = t->rec->mb[left_xy].vec[0][3]; t->mb.vec_ref[IPM_LUMA - 1 + 8].vec = t->rec->mb[left_xy].vec[0][7]; t->mb.vec_ref[IPM_LUMA - 1 + 16].vec = t->rec->mb[left_xy].vec[0][11]; t->mb.vec_ref[IPM_LUMA - 1 + 24].vec = t->rec->mb[left_xy].vec[0][15]; t->mb.vec_ref[IPM_LUMA - 1 + 0].part = t->mb.vec_ref[IPM_LUMA - 1 + 8].part = t->mb.vec_ref[IPM_LUMA - 1 + 16].part = t->mb.vec_ref[IPM_LUMA - 1 + 24].part = t->rec->mb[left_xy].mb_part; t->mb.vec_ref[IPM_LUMA - 8 + 0].subpart = t->rec->mb[left_xy].submb_part[3]; t->mb.vec_ref[IPM_LUMA - 8 + 8].subpart = t->rec->mb[left_xy].submb_part[7]; t->mb.vec_ref[IPM_LUMA - 8 + 16].subpart = t->rec->mb[left_xy].submb_part[11]; t->mb.vec_ref[IPM_LUMA - 8 + 24].subpart = t->rec->mb[left_xy].submb_part[15]; t->mb.sad_ref[0] = t->rec->mb[left_xy].sad; /* load non_zero_count */ t->mb.nnz_ref[NNZ_LUMA - 1 + 0] = t->rec->mb[left_xy].nnz[3]; t->mb.nnz_ref[NNZ_LUMA - 1 + 8] = t->rec->mb[left_xy].nnz[7]; t->mb.nnz_ref[NNZ_LUMA - 1 + 16] = t->rec->mb[left_xy].nnz[11]; t->mb.nnz_ref[NNZ_LUMA - 1 + 24] = t->rec->mb[left_xy].nnz[15]; t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 0] = t->rec->mb[left_xy].nnz[17]; t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 8] = t->rec->mb[left_xy].nnz[19]; t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 0] = t->rec->mb[left_xy].nnz[21]; t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 8] = t->rec->mb[left_xy].nnz[23]; } else { t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 0] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 8] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 16] = t->mb.i4x4_pred_mode_ref[IPM_LUMA - 1 + 24] = -1; INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 0].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 8].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 16].vec); INITINVALIDVEC(t->mb.vec_ref[IPM_LUMA - 1 + 24].vec); t->mb.vec_ref[IPM_LUMA - 1 + 0].part = t->mb.vec_ref[IPM_LUMA - 1 + 8].part = t->mb.vec_ref[IPM_LUMA - 1 + 16].part = t->mb.vec_ref[IPM_LUMA - 1 + 24].part = -1; t->mb.vec_ref[IPM_LUMA - 1 + 0].subpart = t->mb.vec_ref[IPM_LUMA - 1 + 8].subpart = t->mb.vec_ref[IPM_LUMA - 1 + 16].subpart = t->mb.vec_ref[IPM_LUMA - 1 + 24].subpart = -1; t->mb.nnz_ref[NNZ_LUMA - 1 + 0] = t->mb.nnz_ref[NNZ_LUMA - 1 + 8] = t->mb.nnz_ref[NNZ_LUMA - 1 + 16] = t->mb.nnz_ref[NNZ_LUMA - 1 + 24] = 0x80; t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 0] = t->mb.nnz_ref[NNZ_CHROMA0 - 1 + 8] = t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 0] = t->mb.nnz_ref[NNZ_CHROMA1 - 1 + 8] = 0x80; } if (mb_x > 0 && mb_y > 0) { int32_t lefttop_xy = t->mb.mb_xy - t->mb_stride - 1; t->mb.vec_ref[0].vec = t->rec->mb[lefttop_xy].vec[0][15]; t->mb.vec_ref[0].subpart = t->rec->mb[lefttop_xy].submb_part[15]; t->mb.vec_ref[0].part = t->rec->mb[lefttop_xy].mb_part; } #undef INITINVALIDVEC } void T264_mb_save_context(T264_t* t) { memcpy(t->mb.context, &t->mb, sizeof(*t->mb.context)); // memcpy(&t->rec->mb[t->mb.mb_xy], &t->mb, sizeof(*t->mb.context)); } static void T264_reset_ref(T264_t* t) { int32_t i; for(i = 1 ; i < MAX_REFFRAMES ; i ++) { t->refn[i].frame_num = -1; } t->rec = &t->refn[0]; t->refn[0].frame_num = 0; } static void T264_load_ref(T264_t* t) { int32_t i; /* now we only deal with p frame, the descend order is always right */ t->refl0_num = 0; for(i = 1 ; i < t->param.ref_num + 1 ; i ++) { if (t->refn[i].frame_num >= 0) { t->refl0[t->refl0_num ++] = &t->refn[i]; } } } static void T264_extend_border(T264_t* t, T264_frame_t* f) { int32_t i; uint8_t* py; uint8_t* pu; uint8_t* pv; uint8_t* tmpy; uint8_t* tmpu; uint8_t* tmpv; // TODO: we need extend the interpolate pics // top, top-left, top-right py = f->Y[0] - t->edged_stride; pu = f->U - t->edged_stride_uv; pv = f->V - t->edged_stride_uv; for(i = 0 ; i < (EDGED_HEIGHT >> 1) ; i ++) { // y memcpy(py, f->Y[0], t->stride); memset(py - EDGED_WIDTH, f->Y[0][0], EDGED_WIDTH); memset(py + t->stride, f->Y[0][t->stride - 1], EDGED_WIDTH); py -= t->edged_stride; memcpy(py, f->Y[0], t->stride); memset(py - EDGED_WIDTH, f->Y[0][0], EDGED_WIDTH); memset(py + t->stride, f->Y[0][t->stride - 1], EDGED_WIDTH); py -= t->edged_stride; // u memcpy(pu, f->U, t->stride_uv); memset(pu - (EDGED_WIDTH >> 1), f->U[0], EDGED_WIDTH >> 1); memset(pu + t->stride_uv, f->U[t->stride_uv - 1], EDGED_WIDTH >> 1); pu -= t->edged_stride_uv; // V memcpy(pv, f->V, t->stride_uv); memset(pv - (EDGED_WIDTH >> 1), f->V[0], EDGED_WIDTH >> 1); memset(pv + t->stride_uv, f->V[t->stride_uv - 1], EDGED_WIDTH >> 1); pv -= t->edged_stride_uv; } // left & right py = f->Y[0] - EDGED_WIDTH; pu = f->U - (EDGED_WIDTH >> 1); pv = f->V - (EDGED_WIDTH >> 1); for(i = 0 ; i < (t->height >> 1) ; i ++) { // left memset(py, py[EDGED_WIDTH], EDGED_WIDTH); // right memset(&py[t->stride + EDGED_WIDTH], py[t->stride + EDGED_WIDTH - 1], EDGED_WIDTH); py += t->edged_stride; memset(py, py[EDGED_WIDTH], EDGED_WIDTH); memset(&py[t->stride + EDGED_WIDTH], py[t->stride + EDGED_WIDTH - 1], EDGED_WIDTH); py += t->edged_stride; // u memset(pu, pu[EDGED_WIDTH >> 1], EDGED_WIDTH >> 1); memset(&pu[t->stride_uv + (EDGED_WIDTH >> 1)], pu[t->stride_uv + (EDGED_WIDTH >> 1) - 1], EDGED_WIDTH >> 1); pu += t->edged_stride_uv; // v memset(pv, pv[EDGED_WIDTH >> 1], EDGED_WIDTH >> 1); memset(&pv[t->stride_uv + (EDGED_WIDTH >> 1)], pv[t->stride_uv + (EDGED_WIDTH >> 1) - 1], EDGED_WIDTH >> 1); pv += t->edged_stride_uv; } // bottom, left-bottom,right-bottom py = f->Y[0] + t->edged_stride * t->height; tmpy = f->Y[0] + t->edged_stride * (t->height - 1); pu = f->U + t->edged_stride_uv * (t->height >> 1); tmpu = f->U + t->edged_stride_uv * ((t->height >> 1) - 1); pv = f->V + t->edged_stride_uv * (t->height >> 1); tmpv = f->V + t->edged_stride_uv * ((t->height >> 1)- 1); for(i = 0 ; i < (EDGED_HEIGHT >> 1) ; i ++) { // y memcpy(py, tmpy, t->stride); memset(py - EDGED_WIDTH, tmpy[0], EDGED_WIDTH); memset(py + t->stride, tmpy[t->stride - 1], EDGED_WIDTH); py += t->edged_stride; memcpy(py, tmpy, t->stride); memset(py - EDGED_WIDTH, tmpy[0], EDGED_WIDTH); memset(py + t->stride, tmpy[t->stride - 1], EDGED_WIDTH); py += t->edged_stride; // u memcpy(pu, tmpu, t->stride_uv); memset(pu - (EDGED_WIDTH >> 1), tmpu[0], EDGED_WIDTH >> 1); memset(pu + t->stride_uv, tmpu[t->stride_uv - 1], EDGED_WIDTH >> 1); pu += t->edged_stride_uv; // v memcpy(pv, tmpv, t->stride_uv); memset(pv - (EDGED_WIDTH >> 1), tmpv[0], EDGED_WIDTH >> 1); memset(pv + t->stride_uv, tmpv[t->stride_uv - 1], EDGED_WIDTH >> 1); pv += t->edged_stride_uv; } } static void T264_interpolate_halfpel(T264_t* t, T264_frame_t* f) { int32_t src_offset; int32_t width, height; if (t->flags & (USE_HALFPEL| USE_QUARTPEL)) { src_offset = - 32 * t->edged_stride - 32; width = t->edged_width - (EDGED_WIDTH - 32) * 2; height = t->edged_height - (EDGED_HEIGHT - 32) * 2; t->interpolate_halfpel_h(f->Y[0] + src_offset, t->edged_stride, f->Y[1] + src_offset, t->edged_stride, width, height); t->interpolate_halfpel_v(f->Y[0] + src_offset, t->edged_stride, f->Y[2] + src_offset, t->edged_stride, width, height); t->interpolate_halfpel_hv(f->Y[0] + src_offset, t->edged_stride, f->Y[3] + src_offset, t->edged_stride, width, height); } } static void T264_save_ref(T264_t* t) { int32_t i; T264_frame_t tmp; /* deblock filter exec here */ if (t->param.disable_filter == 0) T264_deblock_frame(t, t->rec); /* current only del with i,p */ T264_extend_border(t, t->rec); T264_interpolate_halfpel(t, t->rec); tmp = t->refn[t->param.ref_num]; for(i = t->param.ref_num ; i >= 1 ; i --) { t->refn[i] = t->refn[i - 1]; } t->refn[0] = tmp; t->rec = &t->refn[0]; } void T264_mb_mode_decision(T264_t* t) { if (t->slice_type == SLICE_I) { T264_mode_decision_intra_y(t); } else if(t->slice_type == SLICE_P) { T264_mode_decision_inter_y(t); } } void T264_mb_encode(T264_t* t) { if (t->mb.mb_mode == I_4x4 || t->mb.mb_mode == I_16x16) { T264_encode_intra_y(t); // // Chroma // T264_mode_decision_intra_uv(t); T264_encode_intra_uv(t); t->stat.i_block_num[t->mb.mb_mode] ++; } else if(t->mb.mb_mode == P_L0) { T264_encode_inter_y(t); T264_encode_inter_uv(t); t->stat.p_block_num[t->mb.mb_part] ++; } else if(t->mb.mb_mode == P_SKIP) { // T264_encode_inter_y(t); // T264_encode_inter_uv(t); t->stat.skip_block_num++; } } void T264_emms_c() { } static void T264_init_cpu(T264_t* t) { if ((t->param.cpu & T264_CPU_FORCE) != T264_CPU_FORCE) { t->param.cpu = T264_detect_cpu(); } t->pred16x16[Intra_16x16_TOP] = T264_predict_16x16_mode_0_c; t->pred16x16[Intra_16x16_LEFT] = T264_predict_16x16_mode_1_c; t->pred16x16[Intra_16x16_DC] = T264_predict_16x16_mode_2_c; t->pred16x16[Intra_16x16_PLANE] = T264_predict_16x16_mode_3_c; t->pred16x16[Intra_16x16_DCTOP] = T264_predict_16x16_mode_20_c; t->pred16x16[Intra_16x16_DCLEFT] = T264_predict_16x16_mode_21_c; t->pred16x16[Intra_16x16_DC128] = T264_predict_16x16_mode_22_c; t->pred8x8[Intra_8x8_TOP] = T264_predict_8x8_mode_0_c; t->pred8x8[Intra_8x8_LEFT] = T264_predict_8x8_mode_1_c; t->pred8x8[Intra_8x8_DC] = T264_predict_8x8_mode_2_c; t->pred8x8[Intra_8x8_PLANE] = T264_predict_8x8_mode_3_c; t->pred8x8[Intra_8x8_DCTOP] = T264_predict_8x8_mode_20_c; t->pred8x8[Intra_8x8_DCLEFT] = T264_predict_8x8_mode_21_c; t->pred8x8[Intra_8x8_DC128] = T264_predict_8x8_mode_22_c; t->pred4x4[Intra_4x4_TOP] = T264_predict_4x4_mode_0_c; t->pred4x4[Intra_4x4_LEFT] = T264_predict_4x4_mode_1_c; t->pred4x4[Intra_4x4_DC] = T264_predict_4x4_mode_2_c; t->pred4x4[Intra_4x4_DCTOP] = T264_predict_4x4_mode_20_c; t->pred4x4[Intra_4x4_DCLEFT] = T264_predict_4x4_mode_21_c; t->pred4x4[Intra_4x4_DC128] = T264_predict_4x4_mode_22_c; //cloud add t->pred4x4[Intra_4x4_DIAGONAL_DOWNLEFT] = T264_predict_4x4_mode_3_c; t->pred4x4[Intra_4x4_DIAGONAL_DOWNRIGHT] = T264_predict_4x4_mode_4_c; t->pred4x4[Intra_4x4_VERTICAL_RIGHT] = T264_predict_4x4_mode_5_c; t->pred4x4[Intra_4x4_HORIZONTAL_DOWN] = T264_predict_4x4_mode_6_c; t->pred4x4[Intra_4x4_VERTICAL_LEFT] = T264_predict_4x4_mode_7_c; t->pred4x4[Intra_4x4_HORIZONTAL_UP] = T264_predict_4x4_mode_8_c; if (t->flags & USE_SAD) { t->cmp[MB_16x16] = T264_sad_u_16x16_c; t->cmp[MB_16x8] = T264_sad_u_16x8_c; t->cmp[MB_8x16] = T264_sad_u_8x16_c; t->cmp[MB_8x8] = T264_sad_u_8x8_c; t->cmp[MB_8x4] = T264_sad_u_8x4_c; t->cmp[MB_4x8] = T264_sad_u_4x8_c; t->cmp[MB_4x4] = T264_sad_u_4x4_c; } else { t->cmp[MB_16x16] = T264_satd_u_16x16_c; t->cmp[MB_16x8] = T264_satd_u_16x8_c; t->cmp[MB_8x16] = T264_satd_u_8x16_c; t->cmp[MB_8x8] = T264_satd_u_8x8_c; t->cmp[MB_8x4] = T264_satd_u_8x4_c; t->cmp[MB_4x8] = T264_satd_u_4x8_c; t->cmp[MB_4x4] = T264_satd_u_4x4_c; } t->sad[MB_16x16] = T264_sad_u_16x16_c; t->sad[MB_16x8] = T264_sad_u_16x8_c; t->sad[MB_8x16] = T264_sad_u_8x16_c; t->sad[MB_8x8] = T264_sad_u_8x8_c; t->sad[MB_8x4] = T264_sad_u_8x4_c; t->sad[MB_4x8] = T264_sad_u_4x8_c; t->sad[MB_4x4] = T264_sad_u_4x4_c; t->fdct4x4 = dct4x4_c; t->fdct4x4dc = dct4x4dc_c; t->fdct2x2dc = dct2x2dc_c; t->idct4x4 = idct4x4_c; t->idct4x4dc = idct4x4dc_c; t->idct2x2dc = idct2x2dc_c; t->quant4x4 = quant4x4_c; t->quant4x4dc = quant4x4dc_c; t->quant2x2dc = quant2x2dc_c; t->iquant4x4 = iquant4x4_c; t->iquant4x4dc = iquant4x4dc_c; t->iquant2x2dc = iquant2x2dc_c; t->expand8to16 = expand8to16_c; t->contract16to8 = contract16to8_c; t->contract16to8add = contract16to8add_c; t->expand8to16sub = expand8to16sub_c; t->memcpy_stride_u = memcpy_stride_u_c; t->eighth_pixel_mc_u = T264_eighth_pixel_mc_u_c; t->interpolate_halfpel_h = interpolate_halfpel_h_c; t->interpolate_halfpel_v = interpolate_halfpel_v_c; t->interpolate_halfpel_hv = interpolate_halfpel_hv_c; t->pixel_avg = T264_pixel_avg_c; t->T264_satd_16x16_u = T264_satd_i16x16_u_c; t->emms = T264_emms_c; // flags relative if (t->flags & USE_FULLSEARCH) // xxx t->search = T264_spiral_search_full; else if (t->flags & USE_DIAMONDSEACH) t->search = T264_search; else t->search = T264_search_full; if (t->param.cpu & T264_CPU_MMX) { t->emms = T264_emms_mmx; t->fdct4x4 = dct4x4_mmx; t->fdct4x4dc = dct4x4dc_mmx; t->idct4x4 = idct4x4_mmx; t->idct4x4dc = idct4x4dc_mmx; } if (t->param.cpu & T264_CPU_SSE) { if (t->flags & USE_SAD) { t->cmp[MB_8x16] = T264_sad_u_8x16_sse; t->cmp[MB_8x8] = T264_sad_u_8x8_sse; t->cmp[MB_8x4] = T264_sad_u_8x4_sse; t->cmp[MB_4x8] = T264_sad_u_4x8_sse; t->cmp[MB_4x4] = T264_sad_u_4x4_sse; } t->sad[MB_8x16] = T264_sad_u_8x16_sse; t->sad[MB_8x8] = T264_sad_u_8x8_sse; t->sad[MB_8x4] = T264_sad_u_8x4_sse; t->sad[MB_4x8] = T264_sad_u_4x8_sse; t->sad[MB_4x4] = T264_sad_u_4x4_sse; } if (t->param.cpu & T264_CPU_SSE2) { t->quant4x4 = quant4x4_sse2; t->iquant4x4 = iquant4x4_sse2; if (t->flags & USE_SAD) { t->cmp[MB_16x16] = T264_sad_u_16x16_sse2; t->cmp[MB_16x8] = T264_sad_u_16x8_sse2; } t->sad[MB_16x16] = T264_sad_u_16x16_sse2; t->sad[MB_16x8] = T264_sad_u_16x8_sse2; t->interpolate_halfpel_h = interpolate_halfpel_h_sse2; t->interpolate_halfpel_v = interpolate_halfpel_v_sse2; } } void T264_init_frame(T264_t* t, uint8_t* src, T264_frame_t* f, int32_t frame_num) { f->Y[0] = src; f->U = f->Y[0] + t->width * t->height; f->V = f->U + (t->width * t->height >> 2); f->frame_num = frame_num; } // get non zero count & cbp void T264_mb_encode_post(T264_t* t) { int32_t i, j; if (t->mb.mb_mode == I_16x16) { t->mb.cbp_y = 0; for(i = 0; i < 16 ; i ++) { int32_t x, y; const int32_t nz = array_non_zero_count(&(t->mb.dct_y_z[i][1]), 15); x = luma_inverse_x[i]; y = luma_inverse_y[i]; t->mb.nnz[luma_index[i]] = nz; t->mb.nnz_ref[NNZ_LUMA + y * 8 + x] = nz; if( nz > 0 ) { t->mb.cbp_y = 0x0f; } } } else { t->mb.cbp_y = 0; for(i = 0; i < 16; i ++) { int32_t x, y; const int32_t nz = array_non_zero_count(t->mb.dct_y_z[i], 16); x = luma_inverse_x[i]; y = luma_inverse_y[i]; t->mb.nnz[luma_index[i]] = nz; t->mb.nnz_ref[NNZ_LUMA + y * 8 + x] = nz; if( nz > 0 ) { t->mb.cbp_y |= 1 << (i / 4); } } } /* Calculate the chroma patern */ t->mb.cbp_c = 0; for(i = 0; i < 8; i ++) { int32_t x, y; const int nz = array_non_zero_count(&(t->mb.dct_uv_z[i / 4][i % 4][1]), 15); t->mb.nnz[i + 16] = nz; if (i < 4) { x = i % 2; y = i / 2; t->mb.nnz_ref[NNZ_CHROMA0 + y * 8 + x] = nz; } else { int32_t j = i - 4; x = j % 2; y = j / 2; t->mb.nnz_ref[NNZ_CHROMA1 + y * 8 + x] = nz; } if( nz > 0 ) { t->mb.cbp_c = 0x02; /* dc+ac */ } } if(t->mb.cbp_c == 0x00 && (array_non_zero_count(t->mb.dc2x2_z[0], 4) > 0 || array_non_zero_count(t->mb.dc2x2_z[1], 4) > 0)) { t->mb.cbp_c = 0x01; /* dc only */ } // really decide SKIP mode if(t->slice_type == SLICE_P && t->mb.mb_part == MB_16x16) { if (t->mb.cbp_y == 0 && t->mb.cbp_c == 0) { T264_vector_t vec; T264_predict_mv_skip(t, 0, &vec); if (vec.x == t->mb.vec[0][0].x && vec.y == t->mb.vec[0][0].y) { t->mb.mb_part = MB_16x16; t->mb.mb_mode = P_SKIP; } } } if (t->mb.mb_mode == I_4x4) { int8_t* p = t->mb.i4x4_pred_mode_ref; for(i = 0; i < 16 ; i ++) { int32_t x, y; x = luma_inverse_x[i]; y = luma_inverse_y[i]; p[IPM_LUMA + y * 8 + x] = t->mb.mode_i4x4[i]; t->mb.mode_i4x4[i] = t->mb.mode_i4x4[i]; } } else { memset(t->mb.mode_i4x4, Intra_4x4_DC, 16 * sizeof(uint8_t)); } if (t->mb.mb_mode != I_4x4 && t->mb.mb_mode != I_16x16) { for(i = 0 ; i < 16 ; i ++) { int32_t x, y; x = i % 4; y = i / 4; t->mb.vec_ref[VEC_LUMA + y * 8 + x].vec = t->mb.vec[0][i]; t->mb.vec_ref[VEC_LUMA + y * 8 + x].part = t->mb.mb_part; t->mb.vec_ref[VEC_LUMA + y * 8 + x].subpart = t->mb.submb_part[i]; } } else { memset(t->mb.submb_part, -1, sizeof(t->mb.submb_part)); t->mb.mb_part = -1; #define INITINVALIDVEC(vec) vec.refno = -1; vec.x = vec.y = 0; for(i = 0 ; i < 2 ; i ++) { for(j = 0 ; j < 16 ; j ++) { INITINVALIDVEC(t->mb.vec[i][j]); } } } #undef INITINVALIDVEC } static uint32_t write_dst(uint8_t* src, int32_t nal_pos[4], int32_t nal_num, uint8_t* dst, int32_t dst_size) { int32_t i, j, n; int32_t count; int32_t nal_len; n = 0; for(i = 0 ; i < nal_num - 1; i ++) { nal_len = nal_pos[i + 1] - nal_pos[i]; // start code 00 00 00 01 dst[n ++] = src[0]; dst[n ++] = src[1]; dst[n ++] = src[2]; dst[n ++] = src[3]; count = 0; for(j = 4 ; j < nal_len - 1; j ++) { if (src[j] == 0) { count ++; if (count >= 2 && src[j + 1] <= 3) { dst[n ++] = 0; dst[n ++] = 3; count = 0; continue; } } else { count = 0; } dst[n ++] = src[j]; } dst[n ++] = src[j]; src += nal_len; } return n; } /////////////////////////////////////////////////////////// // interface T264_t* T264_open(T264_param_t* para) { T264_t* t; int32_t i; // // TODO: here check the input param if it is valid // if (para->flags & USE_FORCEBLOCKSIZE) para->flags |= USE_SUBBLOCK; if (para->flags & USE_QUARTPEL) para->flags |= USE_HALFPEL; t = T264_malloc(sizeof(T264_t), CACHE_SIZE); memset(t, 0, sizeof(T264_t)); t->mb_width = para->width >> 4; t->mb_height = para->height >> 4; t->mb_stride = t->mb_width; t->width = t->mb_width << 4; t->height = t->mb_height << 4; t->edged_width = t->width + 2 * EDGED_WIDTH; t->edged_height = t->height + 2 * EDGED_HEIGHT; t->qp_y = para->qp; t->flags = para->flags; t->stride = t->width; t->stride_uv = t->width >> 1; t->edged_stride = t->edged_width; t->edged_stride_uv = t->edged_width >> 1; t->bs_buf = T264_malloc(t->width * t->height << 1, CACHE_SIZE); for(i = 0 ; i < para->ref_num + 1 ; i ++) { uint8_t* p = T264_malloc(t->edged_width * t->edged_height + (t->edged_width * t->edged_height >> 1), CACHE_SIZE); t->refn[i].Y[0] = p + EDGED_HEIGHT * t->edged_width + EDGED_WIDTH; t->refn[i].U = p + t->edged_width * t->edged_height + (t->edged_width * EDGED_HEIGHT >> 2) + (EDGED_WIDTH >> 1); t->refn[i].V = p + t->edged_width * t->edged_height + (t->edged_width * t->edged_height >> 2) + (t->edged_width * EDGED_HEIGHT >> 2) + (EDGED_WIDTH >> 1); t->refn[i].mb = T264_malloc(t->mb_height * t->mb_width * sizeof(T264_mb_context_t), CACHE_SIZE); p = T264_malloc(t->edged_width * t->edged_height * 3, CACHE_SIZE); t->refn[i].Y[1] = p + EDGED_HEIGHT * t->edged_width + EDGED_WIDTH; t->refn[i].Y[2] = t->refn[i].Y[1] + t->edged_width * t->edged_height; t->refn[i].Y[3] = t->refn[i].Y[2] + t->edged_width * t->edged_height; } t->param = *para; t->idr_pic_id = -1; t->frame_id = -1; T264_init_cpu(t); rc_init_seq(t); return t; } void T264_close(T264_t* t) { int32_t i; for(i = 0 ; i < t->param.ref_num + 1 ; i ++) { T264_free(t->refn[i].Y[0] - (EDGED_HEIGHT * t->edged_width + EDGED_WIDTH)); T264_free(t->refn[i].mb); T264_free(t->refn[i].Y[1] - (EDGED_HEIGHT * t->edged_width + EDGED_WIDTH)); } T264_free(t->bs_buf); T264_free(t); } int32_t T264_encode(T264_t* t, uint8_t* src, uint8_t* dst, int32_t dst_size) { int32_t i, j; int32_t nal_pos[4]; // remember each nal start pos int32_t nal_num = 0; int32_t len; eg_init(&t->bs, t->bs_buf, dst_size); T264_init_frame(t, src, &t->cur, t->frame_num); T264_load_ref(t); t->frame_id ++; if (t->frame_num % t->param.idrframe == 0) { nal_pos[nal_num ++] = eg_len(&t->bs); nal_unit_init(&t->nal, 1, NAL_SEQ_SET); nal_unit_write(t, &t->nal); seq_set_init(t, &t->ss); seq_set_write(t, &t->ss); nal_pos[nal_num ++] = eg_len(&t->bs); nal_unit_init(&t->nal, 1, NAL_PIC_SET); nal_unit_write(t, &t->nal); pic_set_init(t, &t->ps); pic_set_write(t, &t->ps); nal_pos[nal_num ++] = eg_len(&t->bs); nal_unit_init(&t->nal, 1, NAL_SLICE_IDR); nal_unit_write(t, &t->nal); t->slice_type = SLICE_I; t->idr_pic_id = (t->idr_pic_id + 1) % 65535; t->frame_num = 0; t->poc = 0; T264_reset_ref(t); rc_init_gop(t); } else if (t->frame_num % t->param.iframe == 0) { nal_pos[nal_num ++] = eg_len(&t->bs); nal_unit_init(&t->nal, 1, NAL_SLICE_NOPART); nal_unit_write(t, &t->nal); t->slice_type = SLICE_I; rc_init_gop(t); } else // P or B pic { nal_pos[nal_num ++] = eg_len(&t->bs); nal_unit_init(&t->nal, 1, NAL_SLICE_NOPART); nal_unit_write(t, &t->nal); t->slice_type = SLICE_P; } rc_init_pic(t); rc_update_qp(t); slice_header_init(t, &t->slice); slice_header_write(t, &t->slice); t->rc.header_bits = eg_len(&t->bs) * 8; t->sad_all = 0; for(i = 0 ; i < t->mb_height ; i ++) { for(j = 0 ; j < t->mb_width ; j ++) { T264_mb_load_context(t, i, j); T264_mb_mode_decision(t); T264_mb_encode(t); T264_mb_encode_post(t); //SKIP if(t->mb.mb_mode == P_SKIP) { t->skip ++; } else { T264_macroblock_write_cavlc(t); } T264_mb_save_context(t); t->sad_all += t->mb.sad; } } /* update current pic */ t->poc = (t->poc + 2) % ((1 << (t->ss.max_pic_order + 4)) - 1); if (t->slice_type != SLICE_B) { T264_save_ref(t); t->frame_num = (t->frame_num + 1) % ((1 << (t->ss.log2_max_frame_num_minus4 + 4)) - 1); } if (t->skip > 0) { eg_write_ue(&t->bs, t->skip); t->skip = 0; } eg_align(&t->bs); eg_flush(&t->bs); nal_pos[nal_num ++] = eg_len(&t->bs); len = write_dst(t->bs_buf, nal_pos, nal_num, dst, dst_size); t->emms(); t->rc.bits = len * 8; rc_update_quad_model(t); rc_update_pic(t); return len; }