; Origin: www.pudn.com > lame3.zip > fftsse.nas  (archive description text was mis-encoded in the listing)


; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA

; GOGO-no-coda
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

%include "nasm.h"

	globaldef fht_SSE
	globaldef fft_side_SSE
	externdef costab_fft
	externdef sintab_fft

	segment_data
; Constant table for the routines in this file.
; Label prefixes follow the element count of each entry as defined below:
;   Q_ = four dwords (16 bytes), D_ = two dwords (8 bytes), S_ = one dword.
; `segment_data` is a macro from nasm.h (not shown here); it presumably
; switches to the data section -- confirm against nasm.h.
;
; The three Q_ entries are 16 bytes each and directly follow the align, so
; each one starts on a 16-byte boundary relative to the aligned position.
	align 16
; 0x80000000 is the sign bit of an IEEE-754 single-precision float.
; Q_MMPP sets it in dwords 2 and 3 only (memory order, dword 0 first).
; NOTE(review): presumably a sign-flip mask for packed floats; its use is
; not in the visible part of the file -- confirm.
Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000
; Same idea, sign bit set in dwords 1 and 3 only.
Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000
; 0.02236068 replicated four times. Numerically this is about sqrt(0.0005);
; NOTE(review): possibly related to S_00005 below -- confirm at the use site.
Q_002	dd	0.02236068, 0.02236068, 0.02236068, 0.02236068
; sqrt(2) stored twice. fht_SSE reads the first element with
; `movss xmm7,[D_SQRT2]` and `fld dword [D_SQRT2]`.
D_SQRT2	dd	1.414213562,1.414213562
; Scalar single-precision constants (not referenced in the visible code).
S_025	dd	0.25
S_05	DD	0.5
S_00005	DD	0.0005

	segment_code
;------------------------------------------------------------------------
;	by K. SAKAI
;	99/08/18	PIII 23k[clk]
;	99/08/19	reordered instructions PIII 22k[clk]
;	99/08/20	ported the bit reversal from the old GOGO code PIII 17k[clk]
;	99/08/23	partially unrolled PIII 14k[clk]
;	99/11/12	clean up
;
;void fht_SSE(float *fz, int n);
	align 16
fht_SSE:
	push	ebx
	push	esi
	push	edi
	push	ebp
%assign _P 4*4

	; second loop
	mov	eax,[esp+_P+4]	;eax=fz
	mov	ebp,[esp+_P+8]	;=n
	shl	ebp,2
	add	ebp,eax		; fn  = fz + n, この関数終了まで不変

	xor	ecx,ecx		; ecx=k=0
	xor	eax,eax
	mov	al,4		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
	xor	edx,edx
	mov	dl,12		; =k3=3*k1
	jmp	short .lp2

	align	16
.lp2:				; do{
	add	cl,2		; k  += 2;
	shl	eax,2
	shl	edx,2

	mov	esi,[esp+_P+4]	;esi=fi=fz
	mov	edi,eax
	shr	edi,1
	add	edi,esi		; edi=gi=fi+ki/2

; For parts where at most 2-way parallelism can be expected, the FPU is faster.
	movss	xmm7,[D_SQRT2]
	jmp	short .lp20

	align	16
.lp20:				; do{
;                       f0     = fi[0 ] + fi[k1];
;                       f2     = fi[k2] + fi[k3];
;                       f1     = fi[0 ] - fi[k1];
;                       f3     = fi[k2] - fi[k3];
;                       fi[0 ] = f0     + f2;
;                       fi[k1] = f1     + f3;
;                       fi[k2] = f0     - f2;
;                       fi[k3] = f1     - f3;
	fld	dword [esi]
	fadd	dword [esi+eax]
	fld	dword [esi+eax*2]
	fadd	dword [esi+edx]

	fld	dword [esi]
	fsub	dword [esi+eax]
	fld	dword [esi+eax*2]
	fsub	dword [esi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [esi+eax]
	fsubp	st1,st0
	fstp	dword [esi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [esi]
	fsubp	st1,st0
	fstp	dword [esi+eax*2]

	lea	esi,[esi + eax*4]	; = fi += (k1 * 4);
;	add	esi,eax
;	add	esi,edx
;                       g0     = gi[0 ] + gi[k1];
;                       g2     = SQRT2  * gi[k2];
;                       g1     = gi[0 ] - gi[k1];
;                       g3     = SQRT2  * gi[k3];
;                       gi[0 ] = g0     + g2;
;                       gi[k2] = g0     - g2;
;                       gi[k1] = g1     + g3;
;                       gi[k3] = g1     - g3;
	fld	dword [edi]
	fadd	dword [edi+eax]
	fld	dword [D_SQRT2]
	fmul	dword [edi+eax*2]

	fld	dword [edi]
	fsub	dword [edi+eax]
	fld	dword [D_SQRT2]
	fmul	dword [edi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [edi+eax]
	fsubp	st1,st0
	fstp	dword [edi+edx]

	fld	st1
	fadd	st0,st1
	fstp	dword [edi]
	fsubp	st1,st0
	fstp	dword [edi+eax*2]

	lea	edi,[edi + eax*4]	; = gi += (k1 * 4);
	cmp	esi,ebp
	jl	near .lp20		; while (fi