/* Source: www.pudn.com > ETSI_ES_202_212_software.rar > ExtAdvFrontEnd.c */
/*===============================================================================
 *      ETSI ES 202 212   Distributed Speech Recognition
 *      Extended Advanced Front-End Feature Extraction Algorithm & Compression Algorithm
 *      Speech Reconstruction Algorithm.
 *      C-language software implementation
 *      Version 1.1.1   October, 2003
 *===============================================================================*/
/*---------------------------------------------------------------------------
 *
 * FILE NAME: ExtAdvFrontEnd.c
 * PURPOSE:   This file contains the main part of Extended Advanced DSR FE.
 *            Speech samples are read from input waveform file frame by frame.
 *            Feature extraction is performed for each frame by calling the
 *            function DoAdvProcess () (see ParmInterface.c).
 *            Feature vectors are output to file in HTK format. VAD information
 *            is output to file in ASCII format.
 *            Command line arguments are handled by a command line parsing
 *            function.
 *
 *---------------------------------------------------------------------------*/

/*-----------------
 * File Inclusions
 *-----------------*/
/* Standard headers: the names were lost when the file was extracted and are
 * restored here from the library functions this file calls
 * (fprintf/fopen, malloc/atoi, strcmp/strcpy, floor). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include "fileio.h"
#include "ParmInterface.h"
#include "pitchInterface.h"

#define VERSION_STR "Version 1.1.1, October 2003"

/*-------------------------------
 * Global Definitions and Macros
 *-------------------------------*/
#define PRINTMOD 15

#define IEEE_LE 0
#define IEEE_BE 1

/*---------------
 * HTK constants
 *---------------*/
#define HTK_PR_MFCC 6
#define HTK_PR_E    0x40
#define HTK_PR_0    0x2000

/*----------------------------------------------
 * Global Variable Definitions and Declarations
 *----------------------------------------------*/
BOOLEAN QuietMode = FALSE,              /* Supress output to stderr */
        FsSpecified = FALSE,            /* Sampling frequency specified */
        SwapSpecified = FALSE,          /* Byte swap for raw data files specified */
        InputKindSpecified = FALSE,     /* Input file format specified */
        NoOutHeaderSpecified = FALSE,   /* No output HTK header option specified */
        Noc0 = FALSE,
/* No c0 coefficient to output feature vector */ NologE = FALSE; /* No logE component to output feature vector */ FILE *fp_in = NULL, /* Input HTK, NIST or raw data file */ *fp_out_htk = NULL, /* Output HTK file */ *fp_out_vad = NULL, /* Output VAD file */ *fp_out_pitch = NULL, /* Output pitch file */ *fp_out_class = NULL; /* Output class file */ int HeaderBytesToSkip = 0; char InFilename[199], /* Name of input file */ OutFilename[199], /* Name of output file */ VADFilename[199], /* Name of output VAD file */ InputKind[10] = "NIST"; /* Input file format */ int SamplingFrequency = 8000, /* SamplingFrequency */ NativeByteOrder = IEEE_LE, /* Native byte ordering */ InputByteOrder, /* Default input byte ordering */ OutParmKind=HTK_PR_MFCC + HTK_PR_E + HTK_PR_0; /* Output parameter kind MFCC_0_E as default */ /*------------ * Prototypes *------------*/ static int ReadBufWave (FILE *fp_in, FILE_TYPE *buf, int nSamples, int Swap); static BOOLEAN ParseCommLine (int argc, char *argv[]); /*----------- * Functions *-----------*/ /*---------------------------------------------------------------------------- * FUNCTION NAME: ReadBufWave * * PURPOSE: Read buffer from input file * * INPUT: * fp_in Pointer to file * nSamples Number of samples to read * swap Byte swap specification * * OUTPUT * buf Pointer to buffer * * RETURN VALUE * FALSE In case of fread() error * TRUE Otherwise * *---------------------------------------------------------------------------*/ static int ReadBufWave (FILE *fp_in, FILE_TYPE *buf, int nSamples, int Swap) { FILE_TYPE s; int i; for (i=0; i > 8); buf[i] = (FILE_TYPE)s; } return TRUE; } /*---------------------------------------------------------------------------- * FUNCTION NAME: ParseCommLine * * PURPOSE: Parses command line arguments, opens input and output files * * INPUT: * argc Number of command line arguments * argv Array of command line arguments * * OUTPUT * none * * RETURN VALUE * FALSE In case of any errors * TRUE Otherwise * 
*---------------------------------------------------------------------------*/ static BOOLEAN ParseCommLine (int argc, char *argv[]) { int mark = 0, mand_arg_no = 1; for ( ; argc; argc--, mark++) { if (strcmp (argv[mark], "-q") == 0) { QuietMode = TRUE; } else if (strcmp (argv[mark], "-fs") == 0) { FsSpecified = TRUE; --argc; ++mark; SamplingFrequency = 1000 * atoi (argv[mark]); } else if (strcmp (argv[mark], "-swap") == 0) { SwapSpecified = TRUE; } else if (strcmp (argv[mark], "-F") == 0) { InputKindSpecified = TRUE; --argc; ++mark; strcpy (InputKind, argv[mark]); } else if (strcmp (argv[mark], "-noh") == 0) { NoOutHeaderSpecified = TRUE; } else if (strcmp (argv[mark], "-noc0") == 0) { Noc0 = TRUE; } else if (strcmp (argv[mark], "-nologE") == 0) { NologE = TRUE; } else if (strcmp (argv[mark], "-skip_header_bytes") == 0) { --argc; ++mark; HeaderBytesToSkip = atoi (argv[mark]); } else if (argv[mark][0] == '-') { fprintf (stderr, "WARNING: Un-recognized flag '%s' !\r\n", argv[mark]); } else { switch (mand_arg_no) { case 1: /* First mandatory argument - input file */ strcpy (InFilename, argv[mark]); fp_in = fopen (argv[mark], "rb"); if (fp_in == NULL) { fprintf (stderr, " ERROR: Could not open file '%s' !\r\n", argv[mark]); return FALSE; } break; case 2: /* Second mandatory argument - output HTK file */ strcpy (OutFilename, argv[mark]); fp_out_htk = fopen (OutFilename, "wb"); if (fp_out_htk == NULL) { fprintf (stderr, " ERROR: Could not open file '%s' !\r\n", OutFilename); return FALSE; } strcpy (VADFilename, strcat(strtok(argv[mark], "."), ".vad")); fp_out_vad = fopen (VADFilename, "wb"); if (fp_out_vad == NULL) { fprintf (stderr, "ERROR: Could not open file '%s' !\r\n", VADFilename); return FALSE; } break; case 3: /* Third mandatory argument - output pitch file */ strcpy (OutFilename, argv[mark]); fp_out_pitch = fopen (OutFilename, "wb"); if (fp_out_pitch == NULL) { fprintf (stderr, " ERROR: Could not open file '%s' !\r\n", OutFilename); return FALSE; } break; case 
4: /* Fourth mandatory argument - output class file */ strcpy (OutFilename, argv[mark]); fp_out_class = fopen (OutFilename, "w"); // ASCII output if (fp_out_class == NULL) { fprintf (stderr, " ERROR: Could not open file '%s' !\r\n", OutFilename); return FALSE; } break; default: fprintf (stderr, " ERROR: Too many input arguments!\r\n"); return FALSE; } /* end switch on mandatory argument no */ mand_arg_no++; } // end of loop on arguments } if (!fp_in || !fp_out_htk || !fp_out_pitch || !fp_out_class) { fprintf (stderr, "ERROR: Input and output files must be given!\r\n"); return FALSE; } if (strcmp (InputKind, "NIST") && strcmp (InputKind, "HTK") && strcmp (InputKind, "RAW")) { fprintf (stderr, "ERROR: Invalid input file format '%s'!\r\n", InputKind); return FALSE; } if (strcmp (InputKind, "RAW") && HeaderBytesToSkip != 0) { fprintf(stderr, "ERROR: skip_header_bytes can be specified for RAW format only.\r\n"); return FALSE; } if (strcmp (InputKind, "RAW") && FsSpecified) fprintf (stderr, "WARNING: Sampling frequency needs to be specified only for raw data files.\r\n"); if (strcmp (InputKind, "RAW") && SwapSpecified) fprintf (stderr, "WARNING: Byte swapping needs to be specified only for raw data files if necessary.\r\n"); if (Noc0 && NologE) TextToParmKind( "MFCC", &OutParmKind); else if (Noc0) TextToParmKind( "MFCC_E", &OutParmKind); else if (NologE) TextToParmKind( "MFCC_0", &OutParmKind); return TRUE; } /*---------------------------------------------------------------------------- * FUNCTION NAME: main * * PURPOSE: Main front-end operations from input speech samples to output * feature vectors. See embedded comments. 
*
 * INPUT:
 *   argc         Number of command line arguments (passed to ParseCommLine)
 *   argv         Array of command line arguments (passed to ParseCommLine)
 *
 * OUTPUT
 *   none
 *
 * RETURN VALUE
 *   TRUE         In case of any errors
 *   FALSE        Otherwise
 *
 * NOTES
 *   Processing order: parse the command line (which opens all files), read
 *   the NIST/HTK input header if any, write a provisional HTK output header,
 *   allocate and initialize the front-end state, process the input frame by
 *   frame, flush the front-end, release memory, rewrite the HTK header with
 *   the final frame count, close the files and print a summary to stderr.
 *---------------------------------------------------------------------------*/
extern int main (int argc, char *argv[])
{
    FILE_TYPE *SigBuf;
    X_FLOAT32 FeatureBuffer[NUM_CEP_COEFF + 2];
    FEParamsX *pFEParX;
    HTK_Header InHheader, OutHheader;
    NIST_Header InNheader;
    int rc;
    int NbSamplesToRead;
    int NumFeatures;                /* Components per output feature vector */
    long FrameCounter = 0;
    long SpeechFrameCounter = 0;
    long NonSpeechFrameCounter = 0;

    /* argv[0] (program name) is not passed on to the parser */
    if (!ParseCommLine (argc - 1, argv + 1))
    {
        fprintf (stderr,"\r\n ETSI ES 202 212 DSR Extended Advanced Front-End Feature Extraction Algorithm");
        fprintf (stderr,"\r\n C-language software implementation");
        fprintf (stderr,"\r\n %s\r\n\n",VERSION_STR);
        fprintf (stderr, "\r\n USAGE:");
        fprintf (stderr, " %s infile HTK_outfile pitch_outfile class_outfile [options]\r\n", argv[0]);
        fprintf (stderr, "\n The program outputs also .vad ASCII VAD file\r\n");
        fprintf (stderr, "\r\n OPTIONS:\r\n");
        fprintf (stderr, " -q Quiet Mode (%s)\r\n", QuietMode ? "TRUE" : "FALSE");
        fprintf (stderr, " -F format Input file format (NIST,HTK,RAW) (%s)\r\n", InputKind);
        fprintf (stderr, " -fs freq Sampling frequency in kHz (%d,%d,%d) (%d)\r\n",
                 SAMPLING_FREQ_1, SAMPLING_FREQ_2, SAMPLING_FREQ_3, SamplingFrequency / 1000);
        fprintf (stderr, " -swap Change input byte ordering (%s)\r\n", SwapSpecified ? "Swapped" : "Native");
        fprintf (stderr, " (Native byte ordering is %s)\r\n", NativeByteOrder ? "ieee-be" : "ieee-le");
        fprintf (stderr, " -noh No HTK header to output file (%s)\r\n", NoOutHeaderSpecified ? "TRUE" : "FALSE" );
        fprintf (stderr, " -noc0 No c0 coefficient to output feature vector (%s)\r\n", Noc0 ? "TRUE" : "FALSE" );
        fprintf (stderr, " -nologE No logE component to output feature vector (%s)\r\n", NologE ? "TRUE" : "FALSE" );
        fprintf (stderr, " -skip_header_bytes n - Skip header, first n bytes (Only for F RAW)\r\n");
        return TRUE;
    }

    if (!QuietMode)
    {
        fprintf (stderr,"\r\n ETSI ES 202 212 DSR Extended Advanced Front-End Feature Extraction Algorithm");
        fprintf (stderr,"\r\n C-language software implementation");
        fprintf (stderr,"\r\n %s\r\n\n",VERSION_STR);
    }

    /*----------------*/
    /* Initialization */
    /*----------------*/
    InputByteOrder = NativeByteOrder;

    /* Noc0 / NologE are fixed once the command line is parsed, so the
       feature vector length can be computed once and reused below */
    NumFeatures = NUM_CEP_COEFF - (Noc0 ? 1:0) + (NologE ? 0:1);

    /*-----------------------------------------------------------------
     * Read input header, extract sampling frequency and byte ordering
     *-----------------------------------------------------------------*/
    if (!strcmp (InputKind, "NIST"))
    {
        if (!ReadNISTHeader (fp_in, &InNheader))
        {
            fprintf (stderr, "ERROR: Invalid NIST header !\r\n");
            return TRUE;
        }
        SamplingFrequency = InNheader.SampleRate;
        /* Byte format string "10" selects big-endian, anything else little-endian */
        if (strcmp (InNheader.SampleByteFormat, "10"))
            InputByteOrder = IEEE_LE;
        else
            InputByteOrder = IEEE_BE;
    }
    else if (!strcmp (InputKind, "HTK"))
    {
        if (!ReadHTKHeader (fp_in, &InHheader, InputByteOrder!=IEEE_BE))
        {
            fprintf (stderr, "ERROR: Invalid HTK header !\r\n");
            return TRUE;
        }
        /* A non-positive sample period would make the division below
           meaningless, so treat it as a corrupt header */
        if (InHheader.sampPeriod <= 0)
        {
            fprintf (stderr, "ERROR: Invalid HTK header !\r\n");
            return TRUE;
        }
        /* 625->16kHz, 1250->8kHz, 909->11kHz */
        SamplingFrequency = 10 * floor ((float) 1e6 / (float) InHheader.sampPeriod);
        InputByteOrder = IEEE_BE;
    }

    /*------------------------------------------------------------
     * Write output header (number of frames to be updated later)
     *------------------------------------------------------------*/
    OutHheader.nSamples = 0;
    OutHheader.sampPeriod = 100000;     /* 10000.0 us (100 Hz) */
    OutHheader.sampSize = NumFeatures * 4;
    OutHheader.sampKind = OutParmKind;

    if (!NoOutHeaderSpecified)
        WriteHTKHeader (fp_out_htk, &OutHheader);

    /*------------------------------------------
     * Memory allocation for FE data structures
     *------------------------------------------*/
    pFEParX = AdvProcessAlloc (SamplingFrequency);
    /* The structure is dereferenced right below, so a failed allocation
       must be caught here */
    if (pFEParX == NULL)
    {
        fprintf (stderr, "ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    /* For pitch and class extraction */
    pFEParX->pfInpSpeech = (X_FLOAT32*)malloc(sizeof(X_FLOAT32)*FRAME_LENGTH);
    if ( !(pFEParX->pfInpSpeech))
    {
        fprintf (stderr, "ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    pFEParX->pfUBSpeech = (X_FLOAT32*)malloc(sizeof(X_FLOAT32)*FRAME_LENGTH);
    if ( !(pFEParX->pfUBSpeech))
    {
        fprintf (stderr, "ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    pFEParX->pfProcSpeech = (X_FLOAT32*)malloc(sizeof(X_FLOAT32)*(FRAME_LENGTH+HISTORY_LENGTH));
    if ( !(pFEParX->pfProcSpeech))
    {
        fprintf (stderr, "ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    /* Size in ELEMENTS is (FRAME_LENGTH+HISTORY_LENGTH)/DOWN_SAMP_FACTOR + 1.
       The previous expression added the 1 to the byte count instead, which
       reserved a single extra byte rather than a whole extra sample. */
    pFEParX->pfDownSampledProcSpeech = (X_FLOAT32*)
        malloc(sizeof(X_FLOAT32)*((FRAME_LENGTH+HISTORY_LENGTH)/DOWN_SAMP_FACTOR+1));
    if ( !(pFEParX->pfDownSampledProcSpeech))
    {
        fprintf (stderr, "ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    /*-------------------------------------------------------
     * Initialization of FE data structures and input buffer
     *-------------------------------------------------------*/
    AdvProcessInit (pFEParX);

    rc = InitPitchRom(&(pFEParX->pPitchRom));
    if (rc != 0)
    {
        fprintf(stderr,"ERROR: Can't initialize PITCH ROM, RC = %d\r\n",rc);
        return TRUE;
    }

    rc = InitPitchEstimator(pFEParX->pPitchRom,&(pFEParX->pPitchEstimator));
    if ( rc!= 0)
    {
        fprintf (stderr, "ERROR: can't initialize PITCH ESTIMATOR, RC = %d\r\n",rc);
        return TRUE;
    }

    /* Hand the open files over to the front-end state */
    pFEParX->speech_fid = fp_in;
    pFEParX->htk_fid = fp_out_htk;
    pFEParX->vad_fid = fp_out_vad;
    pFEParX->pitch_fid = fp_out_pitch;
    pFEParX->class_fid = fp_out_class;

    NbSamplesToRead = pFEParX->NbSamplesToRead;

    SigBuf = calloc (NbSamplesToRead, sizeof (SigBuf[0]));
    if (SigBuf == NULL)
    {
        fprintf (stderr,"ERROR: Memory allocation error occured!\r\n");
        return TRUE;
    }

    /*------------
     * Processing
     *------------*/
    /* Only RAW input may carry a user-specified header to skip.  For NIST
       and HTK input HeaderBytesToSkip is always 0 (enforced by
       ParseCommLine), so seeking here would move the file back to offset 0
       after the header was read above.
       NOTE(review): presumably the header readers leave fp_in at the first
       sample - confirm in fileio.c. */
    if (!strcmp (InputKind, "RAW"))
    {
        if (fseek (fp_in, HeaderBytesToSkip, SEEK_SET) != 0)
        {
            fprintf (stderr, "ERROR: Could not skip %d header bytes !\r\n", HeaderBytesToSkip);
            return TRUE;
        }
    }

    while (ReadBufWave (fp_in, SigBuf, NbSamplesToRead,
                        (SwapSpecified || InputByteOrder != NativeByteOrder)))
    {
        FrameCounter++;

        /* A frame read does not always produce a feature vector */
        if (DoAdvProcess (SigBuf, FeatureBuffer, pFEParX))
        {
            /*---------------
             * Output result
             *---------------*/
            WriteHTKFeature (fp_out_htk, FeatureBuffer, (short) NumFeatures);
            fprintf (fp_out_vad, "%d ", pFEParX->VAD);

            if (pFEParX->VAD == SPEECH_FRAME)
                SpeechFrameCounter++;
            else
                NonSpeechFrameCounter++;
        }

        /*---------------------------
         * Display processing status
         *---------------------------*/
        if (!QuietMode && !(FrameCounter % PRINTMOD))
        {
            fprintf (stderr, "\rProcessing status: %ld frames ...", FrameCounter);
            fflush (stderr);
        }
    }

    /*----------
     * Flushing
     *----------*/
    while (FlushAdvProcess (FeatureBuffer, pFEParX))
    {
        /*---------------
         * Output result
         *---------------*/
        WriteHTKFeature (fp_out_htk, FeatureBuffer, (short) NumFeatures);
        fprintf (fp_out_vad, "%d ", pFEParX->VAD);

        if (pFEParX->VAD == SPEECH_FRAME)
            SpeechFrameCounter++;
        else
            NonSpeechFrameCounter++;
    }

    /*----------------
     * Memory release
     *----------------*/
    /* NOTE(review): pfInpSpeech is allocated above but not released here;
       verify whether AdvProcessDelete() owns it before adding a free(). */
    DeallocatePitchRom(pFEParX->pPitchRom);
    DeallocatePitchEstimator(pFEParX->pPitchEstimator);
    free(pFEParX->pfUBSpeech);
    free(pFEParX->pfProcSpeech);
    free(pFEParX->pfDownSampledProcSpeech);
    AdvProcessDelete (&pFEParX);
    free (SigBuf);

    /*----------------------------------------
     * Correct number of frames in HTK header
     *----------------------------------------*/
    OutHheader.nSamples = SpeechFrameCounter + NonSpeechFrameCounter;

    if (!NoOutHeaderSpecified)
        WriteHTKHeader (fp_out_htk, &OutHheader);

    /*------------------------------
     * Close input and output files
     *------------------------------*/
    fclose (fp_in);
    fclose (fp_out_htk);
    fclose (fp_out_vad);
    fclose (fp_out_pitch);
    fclose (fp_out_class);

    /*----------------------
     * Display final status
     *----------------------*/
    if (!QuietMode)
        fprintf (stderr, "\rProcessed: %ld Frames. \r\n", FrameCounter);

    /* The summary below is printed in quiet mode as well */
    fprintf (stderr, "Extended Advanced Front End: ");
    /* The file name comes from the command line: never use it as a format */
    fprintf (stderr, "%s", InFilename);
    fprintf (stderr, ": %ld Frames, %ld Speech, %ld Non Speech, %ld Lost.\r\n",
             FrameCounter, SpeechFrameCounter, NonSpeechFrameCounter,
             FrameCounter - SpeechFrameCounter - NonSpeechFrameCounter);

    if (SpeechFrameCounter == 0)
        fprintf (stderr, "NO SPEECH DETECTED !\r\n");

    return FALSE;
}