www.pudn.com > lame3(mp3園鷹殻會才彿創).zip > fft.nas



;	for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI, URURI
%include "nasm.h"

	globaldef fht_3DN
	globaldef fht
	externdef costab_fft
	externdef sintab_fft
	externdef gray_index

	segment_data
	align 16
D_MSB1_0	dd	0         ,0x80000000
D_SQRT2	dd	1.414213562,1.414213562
t_s0	dd	0	;[ t_c:t_s]
t_c0	dd	0
t_c1	dd	0	;[-t_s:t_c]
t_s1	dd	0
D_s1c1	dd	0, 0
D_Mc1s1	dd	0, 0
D_s2c2	dd	0, 0
D_Mc2s2	dd	0, 0
D_0_1	dd	1.0, 0.0
S_05	DD	0.5
S_00005	DD	0.0005
fht		dd	0	;関数ポインタ

	segment_code

;************************************************************************

;	by shigeo
;	99/08/16
;	23000clk 辛かった〜
;	18500clk bit reversal from gogo1 by URURI

;void fht(float *fz, int n);
	align 16
fht_3DN:
	push	ebx
	push	esi
	push	edi
	push	ebp
%assign _P 4*4
	;まず最初のループ... はfht()の外へ移動

	mov	esi,[esp+_P+4]	;esi=fz
	mov	ecx,[esp+_P+8]	;ecx=n

	;メインループ
	movq	mm7,[D_MSB1_0]	;mm7=[1<<31:0]

%assign LOCAL_STACK	16
	sub	esp,LOCAL_STACK
%assign _P (_P+LOCAL_STACK)
	xor	eax,eax
	mov	[esp],eax	;k=0
%define k dword [esp]
%define kx	dword [esp+4]
%define fn dword [esp+8]

.lp30:	;k=0; do{
	mov	ecx,k
	add	ecx,2
	mov	k,ecx
	mov	eax,1
	shl	eax,cl		;eax=k1 = 1<>1
	mov	kx,esi		;保存(後で使う)
	mov	edi,[esp+_P+4]	;edi=fi=fz
	lea	ebp,[edi+esi*4]	;ebp=gi=fz+kx
	mov	esi,[esp+_P+8]	;esi=n
	lea	esi,[edi+esi*4]	;esi=fn=fz+n
	movq	mm6,[D_SQRT2]	;mm6=[√2:√2]

.lp31:	;fn=fz+n; do{ FLOAT g0,f0,f1,...
	movd	mm0,[edi]	;mm0=[0:fi[ 0]]
	movd	mm1,[edi+eax*4]	;mm1=[0:fi[k1]]
	punpckldq	mm0,mm0	;mm0=[fi_0 :fi_0 ]
	punpckldq	mm1,mm1	;mm1=[fi_k1:fi_k1]
	movd	mm2,[edi+ebx*4]
	movd	mm3,[edi+ecx*4]
	punpckldq	mm2,mm2	;mm2=[fi_k2:fi_k2]
	punpckldq	mm3,mm3	;mm3=[fi_k3:fi_k3]
	pxor	mm1,mm7		;mm1=[-fi_k1:fi_k1]
	pxor	mm3,mm7		;mm3=[-fi_k3:fi_k3]
	pfadd	mm0,mm1		;mm0=[f1:f0]=[fi_0 -fi_k1 : fi_0 +fi_k1]
	pfadd	mm2,mm3		;mm2=[f3:f2]=[fi_k2-fi_k3 : fi_k2+fi_k3]
	movq	mm3,mm0		;mm3=[f1:f0]
	pfadd	mm0,mm2		;mm0=[f1+f3:f0+f2]
	movd	[edi],mm0	;fi[0]=f0+f2
	psrlq	mm0,32		;mm0=[0:f1+f3]
	pfsub	mm3,mm2		;mm3=[f1-f3:f0-f2]
	movd	[edi+eax*4],mm0	;fi[k1]=f1+f3
	movd	[edi+ebx*4],mm3	;fi[k2]=f0-f2
	psrlq	mm3,32		;mm3=[0:f1-f3]
	movd	[edi+ecx*4],mm3	;fi[k3]=f1-f3

	movd	mm0,[ebp]	;mm0=[0:gi_0]
	movd	mm1,[ebp+eax*4]	;mm1=[0:gi_k1]
	punpckldq	mm0,mm0	;mm0=[gi_0 :gi_0 ]
	punpckldq	mm1,mm1	;mm1=[gi_k1:gi_k1]
	movd	mm2,[ebp+ebx*4]	;mm2=[0:gi_k2]
	pxor	mm1,mm7		;mm1=[-gi_k1:gi_k1]
	punpckldq	mm2,[ebp+ecx*4]	;mm2=[gi_k3:gi_k2]
	pfadd	mm0,mm1		;mm0=[g1:g0]=[gi_0 -gi_k1:gi_0 +gi_k1]
	pfmul	mm2,mm6		;mm2=[g3:g2]=sqrt2 * [gi_k3:gi_k2]
	movq	mm1,mm0		;mm1=[g1:g0]
	pfadd	mm0,mm2		;mm0=[g1+g3:g0+g2]
	movd	[ebp],mm0	;gi[0]=g0+g2
	psrlq	mm0,32		;mm0=[0:g1+g3]
	pfsub	mm1,mm2		;mm1=[g1-g3:g0-g2]
	movd	[ebp+eax*4],mm0	;gi[k1]=g1+g3
	movd	[ebp+ebx*4],mm1	;gi[k2]=g0-g2
	psrlq	mm1,32		;mm1=[0:g1-g3]
	movd	[ebp+ecx*4],mm1	;gi[k3]=g1-g3
	lea	edi,[edi+edx*4]	;fi += k4
	lea	ebp,[ebp+edx*4]	;gi += k4
	cmp	edi,esi
	jc	near .lp31	;}while(fi