www.pudn.com > uda0301s.rar > uda.h


//////////////////////////////////////////////////////////////////////// 
//	UDA Core 0.301 (2006.12.19) Author:dwing 
//////////////////////////////////////////////////////////////////////// 
#include  
#include  
#include  
typedef unsigned char  U8; 
typedef unsigned short U16; 
typedef unsigned int   U32; 
//////////////////////////////////////////////////////////////////////// 
// 0x100000/0x200000:	96/179 MB 
const  U32 MEM	=0x200000;	// 0x10000 ~ 0x2000000 
static U32 type =0; // 0=Default 1=EXE 
static U32 y	=0; // Last bit,0 or 1,set by encoder 
static U32 c0	=1; // Last 0-7 bits of the partial byte with lead 1 bit 
static U32 c1=0,c2=0,c3=0;//Last 1,2,3 byte. 
static U32 c4	=0; // Last 4 whole bytes,packed. Last byte is bits 0-7. 
static U32 bpos =0; // bits in c0 (0 to 7) 
static U32 pos	=0; // Number of input bytes in buf (not wrapped) 
//////////////////////////////////////////////////////////////////////// 
template class Array 
{ 
	T		*data; 
	U8		*p; 
	const U32 SIZE; 
public: 
	explicit Array(U32 i,U32 align=0); 
			~Array()				{free(p);} 
	T&		 operator[](U32 i)		{return data[i];} 
	const T& operator[](U32 i)const {return data[i];} 
	U32 	 size() const			{return SIZE;} 
}; 
//static U32 memsize=0; 
template Array::Array(U32 i,U32 align):SIZE(i) 
{ 
//	memsize+=align+SIZE*sizeof(T);printf("%u ",memsize); 
	if(!(p=(U8*)calloc(align+SIZE*sizeof(T),1))) 
	{ 
		printf("\nERROR: Not enough memory\n"); 
		exit(0); 
	} 
	data=(T*)(align?(p+align)-((U32)p&(align-1)):p); 
} 
//////////////////////////////////////////////////////////////////////// 
class Buf 
{ 
	Array b; 
public: 
		Buf(U32 i):b(i) 		{} 
	U32 size() const			{return b.size();} 
	U8& operator[](U32 i)		{return b[i&(size()-1)];} 
	U32 operator()(U32 i) const {return (U32)b[(pos-i)&(size()-1)];} 
}buf(MEM*8); 
//////////////////////////////////////////////////////////////////////// 
class Ilog	// ilog(x) = round(log2(x)*16),0<=x<65536 
{ 
	Array t; 
public: 
	Ilog():t(65536) 
	{ 
		U32 x=14155776; 
		for(U32 i=2;i<65536;++i) 
			t[i]=(x+=774541002/(i*2-1))>>24;	// 2^29/ln 2 
	} 
	U32 operator()(U16 x) const {return t[x];} 
}ilog; 
//////////////////////////////////////////////////////////////////////// 
U32 squash(int d) // return p = 1/(1+exp(-d)),d scaled 8b,p scaled 12b 
{ 
	static const U32 t[33]= 
	{	   1,	2,	 3,   6,  10,  16,	27,  45, 
		  73, 120, 194, 310, 488, 747,1101,1546, 
		2047,2549,2994,3348,3607,3785,3901,3975, 
		4022,4050,4068,4079,4085,4089,4092,4093,4094 
	}; 
	if(d> 2047) return 4095; 
	if(d<-2047) return 0; 
	const U32 w=d&127; 
	d=(d>>7)+16; 
	return (t[d]*(128-w)+t[(d+1)]*w+64) >> 7; 
} 
//////////////////////////////////////////////////////////////////////// 
// Inverse of squash. d = ln(p/(1-p)),d scaled by 8 bits,p by 12 bits. 
class Stretch 
{ 
	Array t; 
public: 
	Stretch():t(4096) 
	{ 
		U32 j=0; 
		for(int x=-2047;x<=2047;++x)	// invert squash() 
			for(U32 i=squash(x);j<=i;++j) t[j]=x; 
		t[4095]=2047; 
	} 
	int operator()(U32 p) const {return t[p];} 
}stretch; 
//////////////////////////////////////////////////////////////////////// 
// Mixer m(N,M,S=1,w=0) combines models using M neural networks with 
//	 N inputs each,of which up to S may be selected.  If S > 1 then 
//	 the outputs of these neural networks are combined using another 
//	 neural network (with parameters S,1,1).	If S = 1 then the 
//	 output is direct.	The weights are initially w (+-32K). 
//	 It is used as follows: 
// m.update() trains the network where the expected output is the 
//	 last bit (in the global variable y). 
// m.add(stretch(p)) inputs prediction from one of N models.  The 
//	 prediction should be positive to predict a 1 bit,negative for 0, 
//	 nominally +-256 to +-2K.  The maximum allowed value is +-32K but 
//	 using such large values may cause overflow ifN is large. 
// m.set(cxt,range) selects cxt as one of 'range' neural networks to 
//	 use.  0 <= cxt < range.  Should be called up to S times such 
//	 that the total of the ranges is <= M. 
// m.p() returns the output prediction that the next bit is 1 as a 
//	 12 bit number (0 to 4095). 
// dot_product returns dot product t*w of n elements.  n is rounded 
// up to a multiple of 8.  Result is scaled down by 8 bits. 
//int dot_product(short *t,short *w,U32 n) 
//{ 
//	int sum=0; 
//	n=(n+7)&-8; 
//	for(U32 i=0;i> 8; 
//	return sum; 
//} 
__declspec(naked) int __stdcall dot_product(short *t,short *w,U32 n) 
{__asm{ 
		mov 	ecx,[esp+12]	// n 
		dec 	ecx 			// n rounding up 
		mov 	edx,[esp+ 8]	// w 
		and 	ecx,-8 
		js		done_ 
		mov 	eax,[esp+ 4]	// t 
		pxor	mm0,mm0 		// sum = 0 
loop_:	movq	mm1,[eax+ecx*2	]// put halves of vector product in mm0 
		pmaddwd mm1,[edx+ecx*2	] 
		movq	mm2,[eax+ecx*2+8] 
		pmaddwd mm2,[edx+ecx*2+8] 
		psrad	mm1,8 
		psrad	mm2,8 
		paddd	mm0,mm1 
		paddd	mm0,mm2 
		sub 	ecx,8			// each loop sums 4 products 
		jns 	loop_ 
		movq	mm1,mm0 		// add 2 halves of mm0 and return in eax 
		psrlq	mm1,32 
		paddd	mm0,mm1 
		movd	eax,mm0 
		emms 
done_:	ret 	4*3 
}} 
// Train neural network weights w[n] given inputs t[n] and err. 
// w[i] += t[i]*err,i=0..n-1.	t,w,err are signed 16 bits (+- 32K). 
// err is scaled 16 bits (representing +- 1/2).w[i] is clamped to +- 32K 
// and rounded.  n is rounded up to a multiple of 8. 
//void train(short *t,short *w,U32 n,int err) 
//{ 
//	n=(n+7)&-8; 
//	for(U32 i=0;i>16)+1)>>1); 
//		if(wt<-32768) wt=-32768; 
//		if(wt> 32767) wt= 32767; 
//		w[i]=wt; 
//	} 
//} 
__declspec(naked) void __stdcall train(short *t,short *w,U32 n,U32 err) 
{__asm{ mov 	eax,[esp+16]		// err 
		pcmpeqd mm1,mm1 			// 4 copies of 1 in mm1 
		movd	mm0,eax 
		psrlw	mm1,15 
	  punpcklwd mm0,mm0 			// put 2 copies of err in mm0 
		mov 	ecx,[esp+12]		// n 
	  punpcklwd mm0,mm0 			// put 4 copies of err in mm0 
		mov 	eax,[esp+ 4]		// t 
		dec 	ecx 				// n/8 rounding up 
		mov 	edx,[esp+ 8]		// w 
		and 	ecx,-8 
		js		done_ 
loop_:	movq	mm2,[edx+ecx*2	]	// w[i] 
		movq	mm3,[eax+ecx*2	]	// t[i] 
		movq	mm4,[edx+ecx*2+8]	// w[i] 
		movq	mm5,[eax+ecx*2+8]	// t[i] 
		paddsw	mm3,mm3 
		paddsw	mm5,mm5 
		pmulhw	mm3,mm0 
		pmulhw	mm5,mm0 
		paddsw	mm3,mm1 
		paddsw	mm5,mm1 
		psraw	mm3,1 
		psraw	mm5,1 
		paddsw	mm2,mm3 
		paddsw	mm4,mm5 
		movq	[edx+ecx*2	],mm2 
		movq	[edx+ecx*2+8],mm4 
		sub 	ecx,8				// each iteration adjusts 8 weights 
		jns 	loop_ 
		emms 
done_:	ret 	4*4 
}} 
//////////////////////////////////////////////////////////////////////// 
class Mixer 
{ 
	const U32 N,S;		// max inputs,max context sets 
	Array tx;	// N inputs from add() 
	Array wx;	// N*M weights 
	Array cxt; 	// S contexts 
	U32 ncxt;			// number of contexts (0 to S) 
	U32 base;			// offset of next context 
	U32 nx; 			// Number of inputs in tx,0 to N 
	Array pr;		// last result (scaled 12 bits) 
	Mixer *mp;			// points to a Mixer to combine results 
public: 
	Mixer(U32 n,U32 k,U32 s=1,int w=0):N((n+7)&-8),S(s),tx(N,16), 
			wx(N*k,16),cxt(S),ncxt(0),base(0),nx(0),pr(S),mp(0) 
	{ 
		U32 i; 
		for(i=0;i1) mp=new Mixer(S,1,1,0x7fff); 
	} 
	~Mixer() {if(S>1) delete mp;} 
	void add(int x){tx[nx++]=x;}// Input x (call up to N times) 
	void set(U32 cx,U32 range) 
	{				// Set a context (call S times,sum of ranges <= M) 
		cxt[ncxt++]=base+cx; 
		base+=range; 
	} 
	// Adjust weights to minimize coding cost of last prediction 
	void update() 
	{ 
		for(U32 i=ncxt;i--;) 
			train(&tx[0],&wx[cxt[i]*N],nx,((y<<12)-pr[i])*7); 
		nx=base=ncxt=0; 
	} 
	U32 p() 					// predict next bit 
	{ 
		while(nx&7) tx[nx++]=0; // pad 
		if(mp)	// combine outputs 
		{ 
			mp->update(); 
			for(U32 i=ncxt;i--;) 
			{ 
				pr[i]=squash(dot_product(&tx[0],&wx[cxt[i]*N],nx)>>5); 
				mp->add(stretch(pr[i])); 
			} 
			mp->set(0,1); 
			return mp->p(); 
		} 
		return pr[0]=squash(dot_product(&tx[0],&wx[0],nx)>>8); 
	} 
}m(512,(16+256*3+512*13),4,128); 
//////////////////////////////////////////////////////////////////////// 
// APM maps a probability and a context into a new probability 
// that bit y will next be 1.  After each guess it updates 
// its state to improve future guesses.  Methods: 
// APM a(N) creates with N contexts,uses 66*N bytes memory. 
// a.p(pr,cx,rate=8) returned adjusted probability in context cx (0 to 
//	 N-1). rate determines the learning rate (smaller=faster,default 8). 
//	 Probabilities are scaled 16 bits (0-65535). 
class APM 
{ 
	U32 index;		// last p,context 
	const U32 N;	// number of contexts 
	Array t;	// [N][33]: p,context -> p 
public: 
	APM(U32 n):index(0),N(n),t(n*33)	// maps p,cxt -> p initially 
	{ 
		for(U32 i=0;i>rate; 
		t[index+1] += (g-t[index+1])>>rate; 
		const U32 w=pr&127;  // interpolation weight (33 points) 
		index=((pr+2048)>>7)+cxt*33; 
		return (t[index]*(128-w)+t[index+1]*w) >> 11; 
	} 
}; 
//////////////////////////////////////////////////////////////////////// 
#define nex(state,sel) state_table[state][sel] 
static const U8 state_table[256][4]={ 
{  1,  2, 0, 0},{  3,  5, 1, 0},{  4,  6, 0, 1},{  7, 10, 2, 0},//0-3 
{  8, 12, 1, 1},{  9, 13, 1, 1},{ 11, 14, 0, 2},{ 15, 19, 3, 0},//4-7 
{ 16, 23, 2, 1},{ 17, 24, 2, 1},{ 18, 25, 2, 1},{ 20, 27, 1, 2},//8-11 
{ 21, 28, 1, 2},{ 22, 29, 1, 2},{ 26, 30, 0, 3},{ 31, 33, 4, 0},//12-15 
{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},//16-19 
{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},//20-23 
{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 36, 39, 1, 3},{ 36, 39, 1, 3},//24-27 
{ 36, 39, 1, 3},{ 36, 39, 1, 3},{ 38, 40, 0, 4},{ 41, 43, 5, 0},//28-31 
{ 42, 45, 4, 1},{ 42, 45, 4, 1},{ 44, 47, 3, 2},{ 44, 47, 3, 2},//32-35 
{ 46, 49, 2, 3},{ 46, 49, 2, 3},{ 48, 51, 1, 4},{ 48, 51, 1, 4},//36-39 
{ 50, 52, 0, 5},{ 53, 43, 6, 0},{ 54, 57, 5, 1},{ 54, 57, 5, 1},//40-43 
{ 56, 59, 4, 2},{ 56, 59, 4, 2},{ 58, 61, 3, 3},{ 58, 61, 3, 3},//44-47 
{ 60, 63, 2, 4},{ 60, 63, 2, 4},{ 62, 65, 1, 5},{ 62, 65, 1, 5},//48-51 
{ 50, 66, 0, 6},{ 67, 55, 7, 0},{ 68, 57, 6, 1},{ 68, 57, 6, 1},//52-55 
{ 70, 73, 5, 2},{ 70, 73, 5, 2},{ 72, 75, 4, 3},{ 72, 75, 4, 3},//56-59 
{ 74, 77, 3, 4},{ 74, 77, 3, 4},{ 76, 79, 2, 5},{ 76, 79, 2, 5},//60-63 
{ 62, 81, 1, 6},{ 62, 81, 1, 6},{ 64, 82, 0, 7},{ 83, 69, 8, 0},//64-67 
{ 84, 71, 7, 1},{ 84, 71, 7, 1},{ 86, 73, 6, 2},{ 86, 73, 6, 2},//68-71 
{ 44, 59, 5, 3},{ 44, 59, 5, 3},{ 58, 61, 4, 4},{ 58, 61, 4, 4},//72-75 
{ 60, 49, 3, 5},{ 60, 49, 3, 5},{ 76, 89, 2, 6},{ 76, 89, 2, 6},//76-79 
{ 78, 91, 1, 7},{ 78, 91, 1, 7},{ 80, 92, 0, 8},{ 93, 69, 9, 0},//80-83 
{ 94, 87, 8, 1},{ 94, 87, 8, 1},{ 96, 45, 7, 2},{ 96, 45, 7, 2},//84-87 
{ 48, 99, 2, 7},{ 48, 99, 2, 7},{ 88,101, 1, 8},{ 88,101, 1, 8},//88-91 
{ 80,102, 0, 9},{103, 69,10, 0},{104, 87, 9, 1},{104, 87, 9, 1},//92-95 
{106, 57, 8, 2},{106, 57, 8, 2},{ 62,109, 2, 8},{ 62,109, 2, 8},//96-99 
{ 88,111, 1, 9},{ 88,111, 1, 9},{ 80,112, 0,10},{113, 85,11, 0},//100-03 
{114, 87,10, 1},{114, 87,10, 1},{116, 57, 9, 2},{116, 57, 9, 2},//104-07 
{ 62,119, 2, 9},{ 62,119, 2, 9},{ 88,121, 1,10},{ 88,121, 1,10},//108-11 
{ 90,122, 0,11},{123, 85,12, 0},{124, 97,11, 1},{124, 97,11, 1},//112-15 
{126, 57,10, 2},{126, 57,10, 2},{ 62,129, 2,10},{ 62,129, 2,10},//116-19 
{ 98,131, 1,11},{ 98,131, 1,11},{ 90,132, 0,12},{133, 85,13, 0},//120-23 
{134, 97,12, 1},{134, 97,12, 1},{136, 57,11, 2},{136, 57,11, 2},//124-27 
{ 62,139, 2,11},{ 62,139, 2,11},{ 98,141, 1,12},{ 98,141, 1,12},//128-31 
{ 90,142, 0,13},{143, 95,14, 0},{144, 97,13, 1},{144, 97,13, 1},//132-35 
{ 68, 57,12, 2},{ 68, 57,12, 2},{ 62, 81, 2,12},{ 62, 81, 2,12},//136-39 
{ 98,147, 1,13},{ 98,147, 1,13},{100,148, 0,14},{149, 95,15, 0},//140-43 
{150,107,14, 1},{150,107,14, 1},{108,151, 1,14},{108,151, 1,14},//144-47 
{100,152, 0,15},{153, 95,16, 0},{154,107,15, 1},{108,155, 1,15},//148-51 
{100,156, 0,16},{157, 95,17, 0},{158,107,16, 1},{108,159, 1,16},//152-55 
{100,160, 0,17},{161,105,18, 0},{162,107,17, 1},{108,163, 1,17},//156-59 
{110,164, 0,18},{165,105,19, 0},{166,117,18, 1},{118,167, 1,18},//160-63 
{110,168, 0,19},{169,105,20, 0},{170,117,19, 1},{118,171, 1,19},//164-67 
{110,172, 0,20},{173,105,21, 0},{174,117,20, 1},{118,175, 1,20},//168-71 
{110,176, 0,21},{177,105,22, 0},{178,117,21, 1},{118,179, 1,21},//172-75 
{110,180, 0,22},{181,115,23, 0},{182,117,22, 1},{118,183, 1,22},//176-79 
{120,184, 0,23},{185,115,24, 0},{186,127,23, 1},{128,187, 1,23},//180-83 
{120,188, 0,24},{189,115,25, 0},{190,127,24, 1},{128,191, 1,24},//184-87 
{120,192, 0,25},{193,115,26, 0},{194,127,25, 1},{128,195, 1,25},//188-91 
{120,196, 0,26},{197,115,27, 0},{198,127,26, 1},{128,199, 1,26},//192-95 
{120,200, 0,27},{201,115,28, 0},{202,127,27, 1},{128,203, 1,27},//196-99 
{120,204, 0,28},{205,115,29, 0},{206,127,28, 1},{128,207, 1,28},//200-03 
{120,208, 0,29},{209,125,30, 0},{210,127,29, 1},{128,211, 1,29},//204-07 
{130,212, 0,30},{213,125,31, 0},{214,137,30, 1},{138,215, 1,30},//208-11 
{130,216, 0,31},{217,125,32, 0},{218,137,31, 1},{138,219, 1,31},//212-15 
{130,220, 0,32},{221,125,33, 0},{222,137,32, 1},{138,223, 1,32},//216-19 
{130,224, 0,33},{225,125,34, 0},{226,137,33, 1},{138,227, 1,33},//220-23 
{130,228, 0,34},{229,125,35, 0},{230,137,34, 1},{138,231, 1,34},//224-27 
{130,232, 0,35},{233,125,36, 0},{234,137,35, 1},{138,235, 1,35},//228-31 
{130,236, 0,36},{237,125,37, 0},{238,137,36, 1},{138,239, 1,36},//232-35 
{130,240, 0,37},{241,125,38, 0},{242,137,37, 1},{138,243, 1,37},//236-39 
{130,244, 0,38},{245,135,39, 0},{246,137,38, 1},{138,247, 1,38},//240-43 
{140,248, 0,39},{249,135,40, 0},{250, 69,39, 1},{ 80,251, 1,39},//244-47 
{140,252, 0,40},{249,135,41, 0},{250, 69,40, 1},{ 80,251, 1,40},//248-51 
{140,252, 0,41}}; // 252,253-255 are reserved 
//////////////////////////////////////////////////////////////////////// 
// A StateMap maps a nonstationary counter state to a probability. 
// After each mapping,the mapping is adjusted to improve future 
// predictions.  Methods: 
// sm.p(cx) converts state cx (0-255) to a probability (0-4095). 
// Counter state -> probability * 256 
class StateMap 
{ 
	U32 cxt;		// context 
	Array t;	// 256 states -> probability * 64K 
public: 
	StateMap():cxt(0),t(256) 
	{ 
		for(U32 i=0;i<256;++i) 
		{ 
			U32 n0=nex(i,2); 
			U32 n1=nex(i,3); 
			if(!n0) n1*=256;	//128 
			if(!n1) n0*=256;	//128 
			t[i]=65536*(n1+1)/(n0+n1+2); 
		} 
	} 
	U32 p(U32 cx) 
	{ 
		t[cxt]+=((y<<16)-t[cxt]+128)>>8; 
		return t[cxt=cx]>>4; 
	} 
}; 
//////////////////////////////////////////////////////////////////////// 
inline U32 hash(U32 a,U32 b,U32 c=~0,U32 d=~0,U32 e=~0) // ~0 
{ 
	U32 h=a*200002979u+b*30005491u+c*50004239u+d*70004807u+e*110002499u; 
	return h;//^h>>9^a>>2^b>>3^c>>4^d>>5^e>>6; 
} 
//////////////////////////////////////////////////////////////////////// 
// A ContextMap maps contexts to a bit histories and makes predictions 
// to a Mixer.	Methods common to all classes: 
// ContextMap cm(M,C); creates using about M bytes of memory (a power 
//	 of 2) for C contexts. 
// cm.set(cx);	sets the next context to cx,called up to C times 
//	 cx is an arbitrary 32 bit value that identifies the context. 
//	 It should be called before predicting the first bit of each byte. 
// cm.mix(m) updates Mixer m with the next prediction.	Returns 1 
//	 ifcontext cx is found,else 0. Then it extends all the contexts with 
//	 global bit y.	It should be called for every bit: 
//	   if(bpos==0) 
//		 for(int i=0; i= 1. Context need not be hashed. 
// Predict to mixer m from bit history state s,using sm to map s to 
// a probability. 
inline U32 mix2(U32 s,StateMap &sm) 
{ 
	U32 p1=sm.p(s); 
	const U32 n0=-!nex(s,2); 
	const U32 n1=-!nex(s,3); 
	const int st=stretch(p1)>>2; 
	m.add(st); 
	p1>>=4; 
	const U32 p0=255-p1; 
	m.add((p1-p0)); 
	m.add((n1-n0)*st); 
	m.add((p1&n0)-(p0&n1)); 
	m.add((p1&n1)-(p0&n0)); 
	return s>0; 
} 
//////////////////////////////////////////////////////////////////////// 
// Context is looked up directly.  m=size is power of 2 in bytes. 
// Context should be < m/512.  High bits are discarded. 
class SmallStationaryContextMap 
{ 
	Array t; 
	U32 cxt; 
	U16 *cp; 
public: 
	SmallStationaryContextMap(U32 n):t(n/2),cxt(0) 
	{ 
		for(U32 i=0;i> rate; 
		cp=&t[cxt+c0]; 
		m.add(stretch(*cp>>4)); 
	} 
}; 
//////////////////////////////////////////////////////////////////////// 
// Context map for large contexts.Most modeling use this type of context 
// map.It includes a built in RunContextMap to predict last byte seen 
// in the same context,and also bit-level contexts that map to a bit 
// history state. 
// Bit histories are stored in a hash table. The table is organized into 
// 64-byte buckets alinged on cache page boundaries.Each bucket contains 
// a hash chain of 7 elements,plus a 2 element queue(packed into 1 byte) 
// of the last 2 elements accessed for LRU replacement. Each element has 
// a 2 byte checksum for detecting collisions,and an array of 7b history 
// states indexed by last 0~2 bits of context.The buckets are indexed 
// by a context ending after 0,2,or 5 bits of current byte.Thus,each 
// byte modeled results in 3 main memory accesses per context,with all 
// other accesses to cache. 
// On bits 0,2 and 5,the context is updated and a new bucket is selected 
// The most recently accessed element is tried first,by comparing the 
// 16 bit checksum,then the 7 elements are searched linearly.If no match 
// is found,then the element with the lowest priority among 5 elements 
// not in the LRU queue is replaced.  After a replacement,the queue is 
// emptied (so that consecutive misses favor a LFU replacement policy). 
// In all cases,the found/replaced element is put in front of the queue. 
// The priority is the state number of the first element (the one with 0 
// additional bits of context).The states are sorted by increasing n0+n1 
// (number of bits seen),implementing a LFU replacement policy. 
// When the context ends on a byte boundary (bit 0),only 3 of the 7 bit 
// history states are used.  The remaining 4 bytes implement a run model 
// as follows:     where  is the last byte 
// seen,possibly repeated,and  and  are the two bytes seen 
// before the first .   is a 7 bit count and a 1 bit 
// flag.  If d=0 then  = 1..127 is the number of repeats of  
// and no other bytes have been seen,and  are not used. 
// If  = 1 then the history is ,,and  - 2 repeats 
// of .  In this case, is valid only if >= 3 and 
//  is valid only if >= 2. 
// As an optimization, last two hash elements of each byte(representing 
// contexts with 2-7 bits) are not updated until a context is seen for 
// a second time.  This is indicated by  = <1,0>.After update, 
//  is updated to <2,0> or <2,1>. 
class ContextMap 
{ 
	class E 		// hash element,64 bytes 
	{ 
		U16 chk[7]; // byte context checksums 
		U8 last;	// last 2 accesses (0-6) in low,high nibble 
	public: 
		U8 bh[7][7];// byte context,3-bit context -> bit history state 
		// bh[][0] = 1st bit,bh[][1,2] = 2nd bit,bh[][3..6] = 3rd bit 
		// bh[][0] is also a replacement priority,0 = empty 
		U8* get(U16 chk);  // Find element (0-6) matching checksum. 
		// If not found,insert or replace lowest priority (not last). 
	}; 
	Array t; 	// bit histories for bits 0-1,2-4,5-7 
	// For 0-1,also contains a run count in bh[][4] and value in bh[][5] 
	// and pending update count in bh[7] 
	Array cp;	// C pointers to current bit history 
	Array cp0; // First element of 7 element array containing cp[i] 
	Array cxt; // C whole byte contexts (hashes) 
	Array runp;// C [0..3] = count,value,unused,unused 
	StateMap *sm;	// C maps of state -> p 
	U32 cn; 		// Next context to set by set() 
public: 
	ContextMap(U32 n,U32 c=1);	//m = memory in bytes,a power of 2,C = c 
	~ContextMap()		{delete []sm;} 
	void set(U32 cx);	// set next whole byte context 
	U32 mix(); 
}; 
ContextMap::	// Construct using m bytes of memory for c contexts 
ContextMap(U32 n,U32 c):t(n>>6,64),cp(c),cp0(c),cxt(c),runp(c),cn(0) 
{ 
	sm=new StateMap[c]; 
	for(U32 i=0;i>4!=i && pri>16; 
	cxt[i]=cx+i; 
} 
// Update the model with bit y1,and predict next bit to mixer m. 
U32 ContextMap::mix()	// Update model with y 
{ 
	const U32 bp=bpos,cc=c0; 
	U32 ret=0; 
	for(U32 i=0;i1&&runp[i][0]==0)	cp[i]=0; 
		else if(bp==1||bp==3||bp==6)	cp[i]=cp0[i]+1+(cc&1); 
		else if(bp==4||bp==7)			cp[i]=cp0[i]+3+(cc&3); 
		else 
		{ 
			cp0[i]=cp[i]=t[cxt[i]+cc&t.size()-1].get(cxt[i]>>16); 
			if(bp==0)	// Update pending bit histories for bits 2-7 
			{ 
				if(cp0[i][3]==2) 
				{ 
					const U32 c=cp0[i][4]+256; U8 *p; 
					p=t[cxt[i]+(c>>6)&t.size()-1].get(cxt[i]>>16); 
					p[0 		  ]=((c>>5)&1)+1; 
					p[1+((c>>5)&1)]=((c>>4)&1)+1; 
					p[3+((c>>4)&3)]=((c>>3)&1)+1; 
					p=t[cxt[i]+(c>>3)&t.size()-1].get(cxt[i]>>16); 
					p[0 		  ]=((c>>2)&1)+1; 
					p[1+((c>>2)&1)]=((c>>1)&1)+1; 
					p[3+((c>>1)&3)]=( c    &1)+1; 
					cp0[i][6]=0; 
				} 
				// Update run count of previous context 
				if(!runp[i][0]) 		// new context 
					runp[i][0]=2,runp[i][1]=c1; 
				else if(runp[i][1]!=c1) // different byte in context 
					runp[i][0]=1,runp[i][1]=c1; 
				else if(runp[i][0]<254) // same byte in context 
					runp[i][0]+=2; 
				runp[i]=cp0[i]+3; 
			} 
		} 
		// predict from last byte in context 
		if(((U32)runp[i][1]+256)>>(8-bp)==cc)//predicted bit +for1,-for0 
		{ 
			U32 rc=runp[i][0];// count*2,+1 if 2 different bytes seen 
			m.add(((runp[i][1]>>(7-bp)&1)*2-1)*ilog(rc+1)<<(2+(~rc&1))); 
		} 
		else 
			m.add(0); 
		// predict from bit context 
		ret+=mix2(cp[i]?*cp[i]:0, sm[i]); 
	} 
	if(bp==7) cn=0; 
	return ret; 
} 
//////////////////////////////////////////////////////////////////////// 
//matchModel() finds the longest matching context and returns its length 
U32 matchModel() 
{ 
	const  U32 MAXLEN=2047; 	// longest allowed match + 1 
	static Array t(MEM);	// hash table of pointers to contexts 
	static U32 h  =0;			// hash of last 7 bytes 
	static U32 ptr=0;			// points to next byte of match if any 
	static U32 len=0;			// length of match,or 0 ifno match 
	static U32 ret=0; 
	if(!bpos) 
	{ 
		h=h*997*8+c1+1&t.size()-1; // update context hash 
		if(len) ++len,++ptr; 
		else		// find match 
		{ 
			ptr=t[h]; 
			if(ptr && pos-ptr0&&!(ret&0xfff)) printf("pos=%d len=%d ptr=%d\n",pos,len,ptr); 
	} 
	// predict 
	if(len>MAXLEN) len=MAXLEN; 
	int sgn; 
	if(len && c1==buf[ptr-1] && c0==((U32)buf[ptr]+256)>>(8-bpos)) 
	{ 
		if(buf[ptr]>>(7-bpos)&1) sgn=1; 
		else sgn=-1; 
	} 
	else sgn=len=0; 
	m.add(sgn* 4*ilog(len)); 
	m.add(sgn*64*(len<32?len:32));	//112 
	return ret; 
} 
//////////////////////////////////////////////////////////////////////// 
// Model 2-D data with fixed record length. Also order 1-2 models 
// that include the distance to the last match. 
void recordModel() 
{										//buf(1)->last 3 pos 
	static Array cpos1(256),cpos2(256),cpos3(256),cpos4(256); 
	static U32 rlen=2,rlen1=3,rlen2=4;	// run length and 2 candidates 
	static U32 rcount1=0,rcount2=0; 	// candidate counts 
	static ContextMap cm(MEM*4,4); 
	if(!bpos) 
	{ 
		const U32 c=c1,r=pos-cpos1[c]; 
		if(r>1 && r==cpos1[c]-cpos2[c] 
			   && r==cpos2[c]-cpos3[c] 
			   && r==cpos3[c]-cpos4[c] 
			   && (r>15 || (c==buf(r*5+1)) && c==buf(r*6+1))) 
		{ 
				 if(r==rlen1) ++rcount1; 
			else if(r==rlen2) ++rcount2; 
			else if(rcount1>rcount2) rlen2=r,rcount2=1; 
			else rlen1=r,rcount1=1; 
		} 
		if(rcount1>15 && rlen!=rlen1) rlen=rlen1,rcount1=rcount2=0; 
		if(rcount2>15 && rlen!=rlen2) rlen=rlen2,rcount1=rcount2=0; 
		const U32 col=pos%rlen; 
		cm.set(c1<<8|buf(rlen));					//27 
		cm.set(rlen|buf(rlen)<<10|buf(rlen*2)<<18); //25 
		cm.set(rlen|c1		 <<10|col<<18); 		//34 
		cm.set( col|rlen<<12);						//14 
		cpos4[c]=cpos3[c];	// update last context positions 
		cpos3[c]=cpos2[c]; 
		cpos2[c]=cpos1[c]; 
		cpos1[c]=pos; 
	} 
	cm.mix(); 
} 
//////////////////////////////////////////////////////////////////////// 
// Model order 1-2 contexts with gaps. 
void sparseModel() 
{ 
	static ContextMap cm(MEM*4,7),scm(MEM,4); 
	if(!bpos) 
	{ 
		 cm.set(c4&0x00ff00ff); 	//16 
		 cm.set(c4&0xff0000ff); 	//18 
		 cm.set(c4&0x00ffff00); 	//29 
		 cm.set(c1|buf(5)<<8);		//16 
		 cm.set(c1|buf(6)<<8);		//17 
		 cm.set(c3|buf(6)<<8);		//14 
		 cm.set(c4>>24|buf(8)<<8);	//18 
		scm.set(c1);				//70 
		scm.set(c2);				//39 
		scm.set(c3);				//14 
		scm.set(buf(8));			//39 
	} 
	 cm.mix(); 
	scm.mix(); 
} 
//////////////////////////////////////////////////////////////////////// 
// Model English text (words and columns/end of line) 
void wordModel() 
{ 
	static U32 word0=0; 
	static U32 text0=0; // hash stream of letters 
	static ContextMap cm(MEM*16,4); 
	if(!bpos) 
	{ 
		U32 c=c1; 
		if(c-'A'<='Z'-'A') c+='a'-'A'; 
		if(c-'a'<='z'-'a') 
		{ 
			word0=word0*263* 4+c; 
			text0=text0*997*16+c; 
		} 
		else word0=0; 
		cm.set(word0);					//102 
		cm.set(text0&0xffffff); 		//23 
		cm.set(c4&0xffff0000);			//25 
		cm.set(c4&0xff00ff00|buf(6));	//22 
	} 
	cm.mix(); 
} 
//////////////////////////////////////////////////////////////////////// 
// The context is 1B string history that occurs within a 1/2 B context. 
void indirectModel() 
{ 
	static ContextMap cm(MEM*4,2); 
	static Array t1(256); 
	if(!bpos) 
	{ 
		U32 &r1=t1[c2]; r1=r1<<8|c1; 
		const U32 t=c1|t1[c1]<<8; 
		cm.set(t&0xffff);	//13 
		cm.set(t&0xffffff); //12 
	} 
	cm.mix(); 
} 
//////////////////////////////////////////////////////////////////////// 
// Model a 24-bit color uncompressed .bmp or .tiffile. 
// Return width in pixels ifan image file is detected,else 0. 
// 32-bit little endian number at buf(i)..buf(i-3) 
inline U32 i4(int i) 
{ 
	return buf(i)+256*buf(i-1)+65536*buf(i-2)+16777216*buf(i-3); 
} 
inline U32 i2(int i)	// 16-bit 
{ 
	return buf(i)+256*buf(i-1); 
} 
inline U32 sqrbuf(int i)// Square buf(i) 
{ 
	return buf(i)*buf(i); 
} 
int bmpModel() 
{ 
	static U32 w   =0;	// width of image in bytes (pixels * 3) 
	static U32 eoi =0;	// end of image 
	static U32 tiff=0;	// offset of tifheader 
	const  U32 SC  =0x20000; 
	static SmallStationaryContextMap scm1(SC),scm2(SC), 
				   scm3(SC),scm4(SC),scm5(SC),scm6(SC*2); 
	static ContextMap cm(MEM*4,8); 
	// Detect .bmp file header (24 bit color,not compressed) 
	if(!bpos && buf(54)=='B' && buf(53)=='M' && 
				 i4(44)==54  &&  i4(40)==40  && 
				 i2(26)==24  &&  i4(24)==0	 ) 
	{ 
		w=(i4(36)+3&-4)*3;	// image width 
		const U32 height=i4(32); 
		if(w<0x30000 && height<0x10000) 
		{ 
			eoi=pos+w*height;  // image size in bytes 
//			printf("BMP %dx%d\n",w/3,height); 
		} 
		else eoi=pos; 
	} 
	// Detect .tiffile header (24 bit color,not compressed). 
	// Parsing is crude,won't work with weird formats. 
	if(!bpos) 
	{ 
		if(c4==0x49492a00) tiff=pos;  // Intel format only 
		if(pos-tiff==4&&c4!=0x08000000) tiff=0;//8=normal offset to dir 
		if(tiff && pos-tiff==200)//most directory should be read by now 
		{ 
			int dirsize=i2(pos-tiff-4); // number of 12B dir entries 
			int bpp=0,compression=0,width=w=0,height=0; 
			for(U32 i=tiff+6; i0; i+=12) 
			{ 
				int tag=i2(pos-i);//256=width,257=height,259:1=nocompre 
				// 277=3 samples/pixel 
				int tagfmt=i2(pos-i-2); // 3=short,4=long 
				int taglen=i4(pos-i-4); // number of elements in tagval 
				int tagval=i4(pos-i-8); //1long,1-2short,or points2array 
				if((tagfmt==3||tagfmt==4) && taglen==1) 
				{ 
					if(tag==256) width=tagval; 
					if(tag==257) height=tagval; 
					if(tag==259) compression=tagval; // 1=no compression 
					if(tag==277) bpp=tagval;  // should be 3 
				} 
			} 
			if(width>0 && height>0 && width*height>50 && compression==1 
				&& (bpp==1||bpp==3)) 
				eoi=tiff+width*height*bpp,w=width*bpp; 
			if(eoi<=pos) tiff=w=0; 
//			else printf("TIFF %dx%dx%d\n",width,height,bpp); 
		} 
	} 
	if(pos>eoi) return w=0; 
	if(!bpos)	// Select nearby pixels as context 
	{ 
		const U32 color=pos%3; 
		U32 mean=c3+buf(w-3)+buf(w)+buf(w+3); 
		const U32 var=(sqrbuf(3)+sqrbuf(w-3)+sqrbuf(w)+sqrbuf(w+3)- 
													mean*mean/4)>>2; 
		mean>>=2; 
		const U32 logvar=ilog(var); 
		U32 i=0; 
		 cm .set(hash(++i,c3>>2,buf(w)>>2,color)); 
		 cm .set(hash(++i,c3>>2,	c1>>2,color)); 
		 cm .set(hash(++i,c3>>2,	c2>>2,color)); 
		 cm .set(hash(++i,buf(w)>>2,c1>>2,color)); 
		 cm .set(hash(++i,buf(w)>>2,c2>>2,color)); 
		 cm .set(hash(++i, (c3+buf(w))>>1,color)); 
		 cm .set(hash(++i, (c3+buf(w))>>3,c1>>5,c2>>5,color)); 
		 cm .set(hash(++i,mean, logvar>>5,color)); 
		scm1.set((c3  +buf(w))>>1); 
		scm2.set((c3  +buf(w)-buf(w+3))>>1); 
		scm3.set((c3*2-buf(6))>>1); 
		scm4.set((buf(w)*2-buf(w*2))>>1); 
		scm5.set((c3  +buf(w)-buf(w-3))>>1); 
		scm6.set(mean>>1|logvar<<1&0x180); 
	} 
	 cm .mix(); 
	scm1.mix(); 
	scm2.mix(); 
	scm3.mix(); 
	scm4.mix(); 
	scm5.mix(); 
	scm6.mix(); 
	return w; 
} 
//////////////////////////////////////////////////////////////////////// 
// Model x86 code.	The contexts are sparse containing only those bits 
// relevant to parsing (2 prefixes,opcode,and mod and r/m fields of mod 
// RM byte). Get context at buf(i) relevant to parsing 32-bit x86 code 
__declspec(naked) U32 __stdcall bswap(U32 x) 
{ 
	__asm mov eax,[esp+4] 
	__asm bswap eax 
	__asm ret 4 
} 
void exeModel() 
{ 
	const U32 N=12; 
	static ContextMap cm(MEM*2,N); 
	static U32 state=0,base=0,aim=0,len=0; 
	if(!bpos) 
	{ 
		U32 last4=bswap(c4); 
		switch(state) 
		{ 
		case 0: if((last4&0xffff)==0x5a4d) 
				{ 
					state=1; 
					base=pos-4; 
					aim=base+0x40; 
				}return; 
		case 1: if(pos>=aim) 
					if(pos>aim||last4+40x1000) state=0; 
					else {state=2;aim=base+last4+4;} 
				return; 
		case 2: if(pos>=aim) 
					if(pos>aim||last4!=0x4550) state=0; 
					else {state=3;aim+=0x108;} 
				return; 
		case 3: if(pos>=aim) 
					if(pos>aim||!last4||last4>MEM*8) state=0; 
					else {state=4;aim+=4;len=last4;} 
				return; 
		case 4: if(pos>=aim) 
					if(pos>aim||last4<=pos-base||last4>0x10000) state=0; 
					else {state=5;aim=base+last4;} 
				return; 
		case 5: if(pos>=aim) {state=6;aim+=len;type=1;} 
//printf("pos=%p,aim=%p,len=%p\n",pos,aim,len); 
				return; 
		case 6: if(pos>=aim) {state=0;type=0;return;} 
				else for(U32 i=0;i4)<<20); 
				} 
		} 
	} 
	if(state==6) cm.mix(); 
} 
//////////////////////////////////////////////////////////////////////// 
// file types (order is important: the last will be sorted by filetype 
// detection as the first)This combines all context models with a Mixer. 
U32 contextModel() 
{ 
	static ContextMap cm(MEM*32,5); 
	static U32 cxt[9]; 
	m.update(); 
	m.add(256); 
	U32 is=matchModel();	// Length of longest matching context 
	if(is>=100) 
	{ 
		m.set(0,8); 
		return m.p(); 
	} 
	is=bmpModel();	// Image width (B) if BMP or TIFF detected,or 0 
	if(is>0) 
	{ 
		static U32 col=0; 
		if(++col>=24) col=0; 
		m.set(2,8); 
		m.set(col,24); 
		m.set((buf(is)+c3)>>4,32); 
		m.set(c0,256); 
		return m.p(); 
	} 
	if(!bpos) 
	{ 
		for(U32 i=8;i;--i) cxt[i]=cxt[i-1]*257+c1+1; 
		cm.set(cxt[0]); 			//41 
		cm.set(cxt[2]); 			//33 
		cm.set(cxt[3]); 			//18 
		cm.set(cxt[5]); 			//11 
		if(type!=1) cm.set(cxt[8]); //13 
	} 
	const U32 order=cm.mix(); 
	sparseModel();		//396 
	if(type!=1) 
	{ 
		recordModel();	//177 
		wordModel();	//190 
	} 
	indirectModel();	//45 
	exeModel(); 
	m.set(8+order+(c4&0xe0)+(c1==c2)*16,256); 
	m.set(c0* 4+(c1/64),1024);			//37 
	m.set(c1/32+(c2/32)*8+(c3&192),256);//48 
	return m.p(); 
} 
//////////////////////////////////////////////////////////////////////// 
class Predictor 
{ 
	U32 p; 
public: 
	Predictor():p(2048) {} 
	U32 predict() const {return p;} // 0 ~ 4095 
	void update(); 
}; 
//////////////////////////////////////////////////////////////////////// 
void Predictor::update() 
{ 
	static APM a1(256),a2(65536); 
	c0+=c0+y; 
	if(c0>=256) 
	{ 
		buf[pos++]=c0; 
		c3=c2; c2=c1; c1=c0-256; 
		c4=(c4<<8)+c1; c0=1; 
	} 
	bpos=(bpos+1)&7; 
	p=contextModel(); 
	p=(a1.p(p,c0)*3+p+2)>>2; 
	p= a2.p(p,c0+c1*256); 
} 
//////////////////////////////////////////////////////////////////////// 
class Coder 
{ 
public: 
	enum		Mode{ENC,DEC}; 
private: 
	Predictor	pred; 
	U32 		x0,x1,x; 
	FILE		*fp; 
	Mode		mode; 
	enum		{OUTBUFSIZE=8192}; 
	U32 		outpos; 
	U8			outbuf[OUTBUFSIZE]; 
	void encode(U32 b) 
	{ 
		U32 p=pred.predict(); p+=(p<2048); // 1 ~ fff 
		U32 m=x1-x0; m=x0+(m>>12)*p+((m&0xfff)*p>>12); 
		if((y=b)) x1=m; else x0=m+1; 
		pred.update(); 
		while(!((x0^x1)&0xff000000)) 
		{ 
			outbuf[outpos++]=x1>>24; 
			if(outpos==OUTBUFSIZE) 
			{ 
				fwrite(outbuf,1,OUTBUFSIZE,fp); 
				outpos=0; 
			} 
			x0<<=8; 
			x1=(x1<<8)+255; 
		} 
	} 
	U32 decode() 
	{ 
		U32 p=pred.predict(); p+=(p<2048); // 1 ~ fff 
		U32 m=x1-x0; m=x0+(m>>12)*p+((m&0xfff)*p>>12); 
		if((y=x<=m)) x1=m; else x0=m+1; 
		pred.update(); 
		while(!((x0^x1)&0xff000000)) 
		{ 
			x=(x<<8)+(fgetc(fp)&255); 
			x0<<=8; 
			x1=(x1<<8)+255; 
		} 
		return y; 
	} 
public: 
	Coder(Mode m,FILE *f):mode(m),fp(f),x0(0),x1(-1),x(0),outpos(0) 
	{ 
		if(mode==DEC) 
			for(U32 i=4;i;--i) 
				x=(x<<8)+(fgetc(fp)&255); 
	} 
	void reset() {type=0;} 
	void enc(U8 c) 
	{ 
		for(U32 i=8;i;) encode((c>>--i)&1); 
	} 
	U8 dec() 
	{ 
		U8 c=0; 
		for(U32 i=8;i;--i) c+=c+decode(); 
		return c; 
	} 
	void flush() 
	{ 
		if(mode==ENC) 
		{ 
			outbuf[outpos++]=x0>>24; 
			fwrite(outbuf,1,outpos,fp); 
			outpos=0; 
		} 
	} 
}; 
////////////////////////////////////////////////////////////////////////