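; Source listing: www.pudn.com > lame3(mp3園鷹殻會才彿創).zip > fftsse.nas
; (The archive name above is mis-encoded in the listing it was copied from; kept verbatim.)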
; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
; GOGO-no-coda
; Copyright (C) 1999 shigeo
; special thanks to Keiichi SAKAI
%include "nasm.h"
globaldef fht_SSE
globaldef fft_side_SSE
externdef costab_fft
externdef sintab_fft
segment_data
        align   16

; ---------------------------------------------------------------------
; Packed (4 x dword) constants, each starting on a 16-byte boundary.
;
; 0x80000000 is the IEEE-754 single-precision sign bit. The two masks
; below set it in selected dwords only:
;   Q_MMPP : dwords 2 and 3
;   Q_MPMP : dwords 1 and 3
; NOTE(review): presumably XOR masks used to negate those lanes of a
; packed-float register; the code that uses them is not in this view.
; ---------------------------------------------------------------------
Q_MMPP:     dd      0x00000000, 0x00000000, 0x80000000, 0x80000000
Q_MPMP:     dd      0x00000000, 0x80000000, 0x00000000, 0x80000000

; Four copies of 0.02236068 (numerically about sqrt(0.0005), cf. S_00005).
; NOTE(review): the purpose is not visible from this part of the file.
Q_002:      times 4 dd 0.02236068

; ---------------------------------------------------------------------
; Smaller constants.
; ---------------------------------------------------------------------
; Two copies of sqrt(2) rounded to single precision. fht_SSE reads the
; first dword (movss / fld dword [D_SQRT2]).
D_SQRT2:    times 2 dd 1.414213562

; Scalar single-precision constants; their uses are outside this view.
S_025:      dd      0.25
S_05:       dd      0.5
S_00005:    dd      0.0005
segment_code
;------------------------------------------------------------------------
; by K. SAKAI
; 99/08/18 PIII 23k[clk]
; 99/08/19 reordered instructions PIII 22k[clk]
; 99/08/20 ported the bit reversal from the old GOGO code PIII 17k[clk]
; 99/08/23 partially unrolled PIII 14k[clk]
; 99/11/12 clean up
;
;void fht_SSE(float *fz, int n);
align 16
fht_SSE:
push ebx
push esi
push edi
push ebp
%assign _P 4*4
;2つ目のループ
mov eax,[esp+_P+4] ;eax=fz
mov ebp,[esp+_P+8] ;=n
shl ebp,2
add ebp,eax ; fn = fz + n, この関数終了まで不変
xor ecx,ecx ; ecx=k=0
xor eax,eax
mov al,4 ; =k1=1*(sizeof float) // 4, 16, 64, 256,...
xor edx,edx
mov dl,12 ; =k3=3*k1
jmp short .lp2
align 16
.lp2: ; do{
add cl,2 ; k += 2;
shl eax,2
shl edx,2
mov esi,[esp+_P+4] ;esi=fi=fz
mov edi,eax
shr edi,1
add edi,esi ; edi=gi=fi+ki/2
; たかだか2並列しか期待できない部分はFPUのほうが速い。
movss xmm7,[D_SQRT2]
jmp short .lp20
align 16
.lp20: ; do{
; f0 = fi[0 ] + fi[k1];
; f2 = fi[k2] + fi[k3];
; f1 = fi[0 ] - fi[k1];
; f3 = fi[k2] - fi[k3];
; fi[0 ] = f0 + f2;
; fi[k1] = f1 + f3;
; fi[k2] = f0 - f2;
; fi[k3] = f1 - f3;
fld dword [esi]
fadd dword [esi+eax]
fld dword [esi+eax*2]
fadd dword [esi+edx]
fld dword [esi]
fsub dword [esi+eax]
fld dword [esi+eax*2]
fsub dword [esi+edx]
fld st1
fadd st0,st1
fstp dword [esi+eax]
fsubp st1,st0
fstp dword [esi+edx]
fld st1
fadd st0,st1
fstp dword [esi]
fsubp st1,st0
fstp dword [esi+eax*2]
lea esi,[esi + eax*4] ; = fi += (k1 * 4);
; add esi,eax
; add esi,edx
; g0 = gi[0 ] + gi[k1];
; g2 = SQRT2 * gi[k2];
; g1 = gi[0 ] - gi[k1];
; g3 = SQRT2 * gi[k3];
; gi[0 ] = g0 + g2;
; gi[k2] = g0 - g2;
; gi[k1] = g1 + g3;
; gi[k3] = g1 - g3;
fld dword [edi]
fadd dword [edi+eax]
fld dword [D_SQRT2]
fmul dword [edi+eax*2]
fld dword [edi]
fsub dword [edi+eax]
fld dword [D_SQRT2]
fmul dword [edi+edx]
fld st1
fadd st0,st1
fstp dword [edi+eax]
fsubp st1,st0
fstp dword [edi+edx]
fld st1
fadd st0,st1
fstp dword [edi]
fsubp st1,st0
fstp dword [edi+eax*2]
lea edi,[edi + eax*4] ; = gi += (k1 * 4);
cmp esi,ebp
jl near .lp20 ; while (fi