www.pudn.com > myHMMsource.rar > myHMM.c, change:2004-02-26,size:44333b


/************************************************************************ 
 *                                                                      * 
 *  Program packages 'myHMM' :                                          * 
 *                                                                      * 
 *  myHMM.c                                                             * 
 *   -the main program for myHMM hidden Markov model                    * 
 *                                                                      * 
 *  Version 0.1                                                         * 
 *  Date: 3 May 2003                                                    * 
 *                                                                      * 
 *  NOTE: This program package is copyrighted in the sense that it      * 
 *  may be used for scientific purposes. The package as a whole, or     * 
 *  parts thereof, cannot be included or used in any commercial         * 
 *  application without written permission granted by its producents.   * 
 *  No programs contained in this package may be copied for commercial  * 
 *  distribution.                                                       * 
 *                                                                      * 
 *  All comments concerning this program package may be sent to the     * 
 *  e-mail address 'dcslgl@nus.edu.sg'.                                 * 
 *                                                                      * 
 ************************************************************************/ 
 
/************************************************************************* 
NAME 
     myHMM - train the hidden Markov model 
 
SYNOPSIS 
     myHMM -hmm [HMM] [-train trainFile] [-trainedHmm trainedHmmFile] 
     [-test testFile] [-tested testedFile] [-mode mode] 
 
DESCRIPTION 
 
     Input: 
       required: pre-defined hidden Markov model 
       optional: training data, testing data 
 
     Output: 
       optional: trained hidden Markov model, tested result 
 
     Global variables: 
       N: integer, number of states 
       M: integer, number of observations 
       T: integer, the length of the longest input sequence 
       S: integer, finite set of possible states 
       O: integer, finite set of possible observations 
       t: double, transition matrix: N x N 
       e: integer, emission matrix: N x M 
       pi: integer, initial state distribution: 1 x N 
 
       %% A hidden Markov model (HMM) is a five-tuple (S,O,t,e,pi). 
       %% Let lambda = {t,e,pi} denote the parameters for a given HMM 
       %% with fixed S and O. 
 
       %% There are three basic problems in HMM: 
       %% 1) probability of the observation sequence given the HMM model 
       %%   -> forward algorithm & backward algorithm 
       %% 2) the most probable state path of the observation sequence 
       %%   given the HMM model 
       %%   -> Viterbi algorithm 
       %% 3) Building the model given a training set of sequence 
       %%   -> Baum-Welch algorithm 
 
OPTIONS 
     -hmm [HMM] 
         pre-defined hidden Markov Model 
 
     -train trainFile 
         training file 
 
     -trainedHmmFile trainedHmmFile 
         trained hidden Markov model 
 
     -test testFile 
         testing file 
 
     -tested testedFile 
         tested result file 
 
     -mode mode 
         mode must be one of 1, 2 or 3. '1' represents TRAINING, '2' 
         represents TESTING, and '3' represents TRAINING_TESTING. 
*************************************************************************/ 
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <malloc.h> 
#include <math.h> 
#include "tools.h" 
#include "myhmm.h" 
#include "arg.h" 
 
static char usage[] = 
	"myHMM - hidden Markov model\n" 
	"command: myHMM -hmm <hmm> [option]\n" 
	"Required parameters:\n" 
	"  -hmm filename        the file for hmm definition\n" 
	"Optional parameters:\n" 
	"  -train filename      the file for training\n" 
	"  -test filename       the file for testing\n" 
	"  -trainedHmm filename the file for trained hmm\n" 
	"  -tested filename     the file for tested result\n" 
	"  -mode int            the mode for training, testing or both\n" 
   "                       1: training, 2: testing, 3: both\n"; 
 
/************************************************************************ 
NAME 
     main - main function in the program 
 
DESCRIPTION 
     This function extracts parameters from the argument list and perform 
     the specified actions by the parameters. 
 
     Input: 
       arguments in the command line 
	 Output: 
	   specified output(s): trained HMM file and/or tesed result file 
	 Global variable list: 
	   iTrain, iTest, T, trainData, testData, observationDefined, 
	   extraSpace. 
*************************************************************************/ 
int main(int argc, char** argv) 
{ 
    char* hmmFile; 
    char* trainFile; 
    char* testFile; 
    char* trainedHmmFile; 
    char* testedFile; 
    int mode; 
 
    /*---------- get the parameters ----------*/ 
    if (argc <= 1 || extract_parameter(argc, argv, "-help", OPTION2) || extract_parameter(argc, argv, "-h", OPTION2) ) 
    { 
        printf("%s", usage);
 
        exit(0); 
    } 
    hmmFile   = extract_parameter(argc, argv, HMM_FILE, ALWAYS); 
    trainFile = extract_parameter(argc, argv, TRAINING_FILE, OPTION); 
    testFile  = extract_parameter(argc, argv, TESTING_FILE, OPTION); 
    trainedHmmFile = extract_parameter(argc, argv, TRAINED_HMM_FILE, OPTION); 
    testedFile    = extract_parameter(argc, argv, TESTED_FILE, OPTION); 
    mode = oatoi(extract_parameter(argc, argv, MODE, OPTION), TRAINING); 
 
    /* check whether the mode is consistant with the provided files */ 
    check_mode(mode, trainFile, testFile); 
 
    /* default values for trainedHmmFile and testedFile */ 
    if(trainedHmmFile == NULL) 
        trainedHmmFile = defTrainedHmmFile; 
    if(testedFile == NULL) 
        testedFile = defTestedFile; 
 
    /*---------- load HMM ----------*/ 
    loadHMM(hmmFile); 
 
    /*---------- train the HMM model ----------*/ 
    if(mode == TRAINING || mode == TRAINING_TESTING) 
    { 
        /*---------- load training data ----------*/ 
        iTrain = cal_lines(trainFile); 
        T = getLengthOfLongestLine(trainFile) + extraSpace; 
        loadSeq(&trainData, trainFile); 
        if(observationsDefined == FALSE) 
        { 
            readObservations(trainData); 
        } 
        /*---------- initialization ----------*/ 
        init(); 
        /*---------- train HMM ----------*/ 
        baumWelch(); 
        /*---------- save trained HMM ----------*/ 
        saveHmm(trainedHmmFile); 
    } 
 
    /*---------- test with the HMM model ----------*/ 
    if(mode == TESTING || mode == TRAINING_TESTING) 
    { 
        /*---------- 5. load testing data ----------*/ 
        if(mode == TESTING) 
        { 
            T = getLengthOfLongestLine(testFile) + extraSpace; 
            init(); 
        } 
        iTest = cal_lines(testFile); 
        loadSeq(&testData, testFile); 
        /*---------- test ----------*/ 
        test(testedFile); 
    } 
    /* post-processing */ 
    post_processing(); 
 
    return 0; 
} 
 
/************************************************************************ 
NAME 
     post_processing - post processing after the main process 
 
DESCRIPTION 
     This function releases the allocated memery for the global variables. 
 
     Input: 
	   pointers to global variables. 
	 Output: 
	   None. 
	 Global variable list: 
	   transitions, emissions, pi, observations, alpha, beta, trainData, 
	   testData. 
*************************************************************************/ 
void post_processing() 
{ 
    int i; 
 
    /* free space for transition matrix */ 
    if(transitions != NULL) 
    { 
        for(i=0; i < N; i++) 
        { 
            if(transitions[i] != NULL) free(transitions[i]); 
        } 
        free(transitions); 
    } 
    /* free space for emission matrix */ 
    if(emissions != NULL) 
    { 
        for(i=0; i < N; i++) 
        { 
            if(emissions[i] != NULL) free(emissions[i]); 
        } 
        free(emissions); 
    } 
    /* free space for initial distribution vector */ 
    if(pi != NULL) free(pi); 
    /* free space for observation list */ 
    if(observations != NULL) free(observations); 
    /* free space for matrix alpha, used in forward algorithm */ 
    if(alpha != NULL) 
    { 
        for(i=0; i < T; i++) 
        { 
            if(alpha[i] != NULL) free(alpha[i]); 
        } 
        free(alpha); 
    } 
    /* free space for matrix beta, used in backward algorithm */ 
    if(beta != NULL) 
    { 
        for(i=0; i < T; i++) 
        { 
            if(beta[i] != NULL) free(beta[i]); 
        } 
        free(beta); 
    } 
    /* free space for training data */ 
    if(trainData != NULL) 
    { 
        for(i=0; i < iTrain; i++) 
        { 
            if(trainData[i] != NULL) free(trainData[i]); 
        } 
        free(trainData); 
    } 
    /* free space for testing data */ 
    if(testData != NULL) 
    { 
        for(i=0; i < iTest; i++) 
        { 
            if(testData[i] != NULL) free(testData[i]); 
        } 
        free(testData); 
    } 
} 
 
/************************************************************************ 
NAME 
     saveHmm - save the trained hidden Markov model in pre-defined format 
 
DESCRIPTION 
     This function saves the trained hidden Markov model in pre-defined 
	 format. 
 
     Input: 
	   trained HMM & output file name 
	 Output: 
	   output file 
	 Global variables: 
	   N, M, T, transitions, emissions, pi. 
*************************************************************************/ 
void saveHmm(char *out) 
{ 
    FILE *fp; 
    int i, j; 
 
    /* Open for write (will fail if inputfile does not exist) */ 
    if( (fp  = fopen( out, "w" )) == NULL ) 
    { 
        printf( "The file '%s' was not opened\n", out); 
        exit(1); 
    } 
    fprintf(fp, "%d %d %d\n", N, M, T); 
    fprintf(fp, "// Observation list\n"); 
    fprintf(fp, "O: "); 
    for(i=0; i < M; i++) 
    { 
        fprintf(fp, "%c ", observations[i]); 
    } 
    fprintf(fp, "\n"); 
    /* transition matrix */ 
    fprintf(fp, "// Transition matrix\n"); 
    for(i=0; i < N; i++) 
    { 
        for(j=0; j < N; j++) 
        { 
            fprintf(fp, "%6.4f ", transitions[i][j]); 
        } 
        fputc('\n', fp); 
    } 
    /* emission matrix */ 
    fprintf(fp, "// Emission matrix\n"); 
    // emission matrix 
    for(i=0; i < N; i++) 
    { 
        for(j=0; j < M; j++) 
        { 
            fprintf(fp, "%6.4f ", emissions[i][j]); 
        } 
        fputc('\n', fp); 
    } 
    /* initial state distribution */ 
    fprintf(fp, "// Initial state distribution\n"); 
    for(i=0; i < N; i++) 
    { 
        fprintf(fp, "%6.4f ", pi[i]); 
    } 
    fputc('\n', fp); 
 
    fclose( fp ); 
} 
 
/************************************************************************ 
NAME 
     test - test the test file with given hidden Markov model 
 
DESCRIPTION 
     This function tests the test file with given hidden Markov model. 
 
     Input: 
	   trained HMM & tested file name 
	 Output: 
	   tested file 
	 Global variables list: 
	   T, iTest, testData, extraObservations. 
*************************************************************************/ 
void test(char* testedFile) 
{ 
    FILE *out_fp; 
    int i; 
    double prob; 
    /* the most probable state path of the current observation sequence */ 
    char *buffer; 
    /* the most probable state paths of all the observation sequences */ 
    double score; 
 
    buffer = (char *) malloc(T * sizeof(char)); 
 
    /* Open for write (will fail if inputfile does not exist) */ 
    if( (out_fp  = fopen( testedFile, "w" )) == NULL ) 
    { 
        printf( "The file '%s' was not opened\n", testedFile); 
        exit(1); 
    } 
 
    for(i=0; i < iTest; i++) 
    { 
        prob = forward( testData[i] ); 
        /* score is equal to the natural logarithm of the probability 
           divided by the length */ 
        score = log(prob) / (double)(strlen(testData[i]) - extraObservations); 
        viterbi(testData[i], buffer); 
        /* output probability and the most probable path */ 
        fprintf(out_fp, "%f\n%s\n%s\n", score, testData[i], buffer); 
    } 
     
    free( buffer ); 
 
    fclose( out_fp ); 
} 
 
/************************************************************************ 
NAME 
     getSymbol - get the symbol corresponding to each state 
 
DESCRIPTION 
     This function .. There are only 52 distinct symbols in this function. 
 
     Input: 
	   state number 
	 Output: 
	   corresponding symbol. 
	 Global variables list: 
	   None. 
*************************************************************************/ 
char getSymbol(int k) 
{ 
    char result; 
     
    if(k >= 0 && k < 26) 
        result = 'a' + k; 
    else if(k >= 26 && k < 52) 
        result = 'A' + k - 26; 
    else 
        result = '-'; 
     
    return result; 
} 
 
/************************************************************************ 
NAME 
     getObservation - get the order of the specified observation 
 
DESCRIPTION 
     This function gets the order of the specified observation in the 
	 observation list with binary search  
 
     Input: 
	   specified observation 
	 Output: 
	   the order of the specified observation 
	 Global variables list: 
	   M. 
*************************************************************************/ 
int getObservation(char ch) 
{ 
    int start, end, middle; 
    /* initial value */ 
    start = 0; 
    end = M-1; 
    middle= -1; 
    //exception: the input observation is not in the observation list 
    if(ch < observations[start] || ch > observations[end]) 
    { 
        printf("the input observation is not in the observation list!\n"); 
        exit(1); 
    } 
 
    if(observations[start] == ch) 
        return start; 
    if(observations[end] == ch) 
        return end; 
    middle = (start + end)/2; 
 
    /* why middle != start: 
        if middle == start, it means end = start + 1, and 'ch' is a 
        character which lies between observation 'start' and 
        observation 'end' */ 
    while(observations[middle]!=ch && start < end && middle != start) 
    { 
        if(observations[middle] < ch) 
            start = middle; 
        else 
            end = middle; 
 
        middle = (start + end) / 2; 
    } 
 
    if(observations[middle] != ch) 
    { 
        printf("The char is not in observation list!\n"); 
        exit(1); 
    } 
 
    return middle; 
} 
 
 
/************************************************************************ 
NAME 
     forward - forward algorithm 
 
DESCRIPTION 
     This function calculates the probability of the observation sequence 
	 with forward algorithm given the HMM model. 
 
     Input: 
	   HMM model & the observation sequence 
	 Output: 
	   Probability 
	 Global variables list: 
	   N, alpha, pi, transitions, emissions. 
*************************************************************************/ 
double forward(char *line) 
{ 
    int i, j, t; 
    int length = strlen( line ); 
    /* the order of the real observation in the observation set */ 
    int iObservation; 
    /* transition probability to state i given the observation sequence */ 
    double sumTransProb; 
    double prob; 
 
    /* basic condition in forward algorithm */ 
    t=0; 
    iObservation = getObservation( line[t] ); 
    for(i=0; i < N; i++) 
    { 
        alpha[t][i] = pi[i]*emissions[i][ iObservation ]; 
    } 
 
    /* the induction step in forward algorithm */ 
    for(t=1; t < length; t++) 
    { 
        iObservation = getObservation( line[t] ); 
        for(i=0; i < N; i++) 
        { 
            sumTransProb = 0; 
            for(j=0; j < N; j++) 
            { 
                sumTransProb = sumTransProb + alpha[t-1][j] * transitions[j][i]; 
            } 
 
            alpha[t][i] = sumTransProb * emissions[i][iObservation]; 
        } 
    } 
 
    /* the termination step */ 
    t = length-1; 
    prob = 0; 
    for(i=0; i < N; i++) 
    { 
        prob = prob + alpha[t][i]; 
    } 
 
    return prob; 
} 
 
 
/************************************************************************ 
NAME 
     backward - backward algorithm 
 
DESCRIPTION 
     This function calculates the probability of the observation sequence 
	 with backward algorithm given the HMM model. 
 
     Input: 
	   HMM model & the observation sequence 
	 Output: 
	   Probability 
	 Global variables list: 
	   N, beta, pi, transitions, emissions. 
*************************************************************************/ 
double backward(char *line) 
{ 
    int i, j, t; 
    int length = strlen( line ); 
    /* the order of the real observation in the observation set */ 
    int iObservation; 
    /* probability to state i given the sequence and the observation */ 
    double sumProbi; 
    double prob; 
 
    /* basic condition in backward algorithm */ 
    t = length-1; 
    for(i=0; i < N; i++) 
    { 
        beta[t][i] = 1; 
    } 
 
    /* the induction step in backward algorithm */ 
    for(t=length-2; t >= 0; t--) 
    { 
        iObservation = getObservation( line[t+1] ); 
        for(i=0; i < N; i++) 
        { 
            sumProbi = 0; 
            for(j=0; j < N; j++) 
            { 
                sumProbi = sumProbi + transitions[i][j]*emissions[j][iObservation] * beta[t+1][j]; 
            } 
 
            beta[t][i] = sumProbi; 
        } 
    } 
 
    /* the termination step */ 
    t = 0; 
    iObservation = getObservation( line[t] ); 
    prob = 0; 
    for(i=0; i < N; i++) 
    { 
        prob = prob + pi[i]*emissions[i][iObservation]*beta[t][i]; 
    } 
 
    return prob; 
} 
 
/************************************************************************ 
NAME 
     viterbi - viterbi algorithm 
 
DESCRIPTION 
     This function calculates the most probable state path of the 
	 observation sequence with viterbi algorithm given the HMM model. 
 
     Input: 
	   HMM model & the observation sequence 
	 Output: 
	   the most probable state path 
	 Global variables list: 
	   N, beta, pi, transitions, emissions. 
*************************************************************************/ 
void viterbi(char *line, char *buffer) 
{ 
    /* delta_t0[ N ] */ 
    double* delta_t0; 
    /* delta_t1[ N ] */ 
    double* delta_t1; 
    /* the matrix to store the optimal states, used for backtracking 
       size: N * length */ 
    int *phi; 
    /* position in the matrix phi */ 
    int pos; 
    int length = strlen( line ); 
    int i, j, t; 
    int fromState, inState; 
    int iObservation; 
    double sum_for_scale; 
    double maxProb, currentProb; 
 
    /* allocate space for buffer */ 
    delta_t0 = (double*)malloc(N*sizeof(double)); 
    delta_t1 = (double*)malloc(N*sizeof(double)); 
    phi = (int*)malloc(length * N * sizeof( int )); 
 
    /* the initial step */ 
    iObservation = getObservation( line[0] ); 
    for(i=0; i < N; i++) 
    { 
        delta_t0[i] = pi[i] * emissions[i][iObservation]; 
        phi[i] = 0; 
    } 
 
    /* recursion step */ 
    pos = N; 
    for(t=1; t < length; t++) 
    { 
        iObservation = getObservation( line[t] ); 
        sum_for_scale = 0; 
        for(j=0; j < N; j++) 
        { 
            /* find the optimal state from which transfered to state j */ 
            maxProb = delta_t0[0] * transitions[0][j]; 
            fromState = 0; 
            for(i=1; i < N; i++) 
            { 
                currentProb = delta_t0[i] * transitions[i][j]; 
                if(maxProb < currentProb) 
                { 
                    maxProb = currentProb; 
                    fromState = i; 
                } 
            } 
            /* calculate the transition probability */ 
            delta_t1[j] = emissions[j][iObservation] * maxProb; 
            sum_for_scale = sum_for_scale + delta_t1[j]; 
            phi[ pos++ ] = fromState; 
        } 
 
        /* update current probabilities */ 
        for(j=0; j < N; j++) 
        { 
            if(sum_for_scale < 0.1) 
                delta_t0[j] = delta_t1[j] * 10; 
            else 
                delta_t0[j] = delta_t1[j]; 
        } 
    } 
 
    /* termination */ 
    maxProb = delta_t0[0]; 
    inState = 0; 
    for(i=1; i < N; i++) 
    { 
        currentProb = delta_t0[i]; 
        if(maxProb < currentProb) 
        { 
            maxProb = currentProb; 
            inState = i; 
        } 
    } 
 
    /* path backtracking */ 
    i = inState; 
    pos = length * N; 
    for(t=length-1; t >= 0 ; t--) 
    { 
        buffer[ t ] = getSymbol( i ); 
        pos = pos - N; 
        if(pos+i < 0) 
        { 
            printf("Error in the subscribe 1: pos = %d, statei = %d\n", pos, i); 
            exit(1); 
        } 
        if(pos+i >= length * N) 
        { 
            printf("Error in the subscribe 2\n"); 
            exit(1); 
        } 
        i = phi[pos + i]; 
    } 
 
    buffer[length] = '\0'; 
    free(delta_t0); 
    free(delta_t1); 
    free( phi ); 
 
    return; 
} 
 
/************************************************************************ 
NAME 
     baumWelch - baumWelch algorithm 
 
DESCRIPTION 
     This function estimates the parameters of HMM with specified topology 
	 given the training data and initial parameters of HMM 
 
     Input: 
	   HMM topology, HMM initial parameters, and training data 
	 Output: 
	   the estimated parameters 
	 Global variables list: 
	   iTrain, trainData, T, M, N, alpha, beta, pi, transitions, emissions. 
*************************************************************************/ 
void baumWelch() 
{ 
    int loops; 
    int i, j, k, t; 
    int iObservation; 
    int length; 
    double** _transitions; /* _transitions[ N ][ N ] estimated transition matrix T */ 
    double** _emissions;   /* _emissions[ N ][ M ]   estimated emission matrix E */ 
    double* _pi;           /* _pi[ N ]               estimated initial distribution */ 
    double oldLikelihood, Likelihood; 
    double error; 
    double sumGamma1; 
    double* gamma1;  // gamma1[N]: for estimating initial distribution of the states 
    double** Eij;  // Eij[N][N]: expected numbers transited from state i to state j 
    double** Eia;  // Eia[N][M]: expected numbers of emitting a in state i 
	/* X_d_t[N][N]: expected numbers transited from state i to state j 
         when it is with observation O in the position t of d-th sequence */ 
    double** X_d_t; 
    double* ei;    // ei[N]: expected numbers in state i 
    double* ej;    // ej[N]: expected numbers of emissions in state j 
    double Prob; 
    double temp, temp2, tempM; 
 
    /* allocate space */ 
    AllocateDataSpace( &_transitions, N, N ); 
    AllocateDataSpace( &_emissions, N, M ); 
    _pi = (double*)malloc(N*sizeof(double)); 
    gamma1 = (double*)malloc(N*sizeof(double)); 
    AllocateDataSpace( &Eij, N, N ); 
    AllocateDataSpace( &Eia, N, M ); 
    AllocateDataSpace( &X_d_t, N, N ); 
    ei = (double*)malloc(N*sizeof(double)); 
    ej = (double*)malloc(N*sizeof(double)); 
 
    /* initialization */ 
    error = 1; 
    oldLikelihood = forward( trainData[0] ); 
    Likelihood = backward( trainData[0] ); 
    for(k=1; k < iTrain; k++) 
    { 
        oldLikelihood = oldLikelihood + forward(trainData[k]); 
        Likelihood = Likelihood + backward(trainData[k]); 
    } 
    /* check whether forward and backward algorithms work well */ 
    if(fabs((Likelihood-oldLikelihood)/Likelihood) > 0.01) 
    { 
        printf("there are errors in forward or backward algorithm\n"); 
        exit(1); 
    } 
    Likelihood = oldLikelihood; 
    printf("Likelihood from forward: %e\n", Likelihood); 
    loops = 0; 
    while((error > threshold && loops < 1000) && Likelihood > threshold) 
    { 
        printf("loops: %d\n", loops); 
        loops++; 
        /* initialize the expected matrix */ 
        for(i=0; i < N; i++) 
        { 
            gamma1[i] = 0; 
            for(j=0; j < N; j++) 
            { 
                Eij[i][j] = 0; 
            } 
            for(j=0; j < M; j++) 
            { 
                Eia[i][j] = 0; 
            } 
        } 
 
        // loop for all the sequences 
        for(k=0; k < iTrain; k++) 
        { 
            // initialize the matrix alpha and beta 
            Prob = backward(trainData[k]); 
            Prob = forward(trainData[k]); 
            length = strlen(trainData[k]); 
            // loop for one sequence 
            for(t=0; t < length-1; t++) 
            { 
                iObservation = getObservation(trainData[k][t+1]); 
                for(i=0; i < N; i++) 
                { 
                    for(j=0; j < N; j++) 
                    { 
                        X_d_t[i][j] = (alpha[t][i] * transitions[i][j] * emissions[j][iObservation] * beta[t+1][j]) / Prob; 
                        Eij[i][j] = Eij[i][j] + X_d_t[i][j]; 
                        Eia[j][iObservation] = Eia[j][iObservation] + X_d_t[i][j]; 
                        if(t == 0) 
                        { 
                            gamma1[i] = gamma1[i] + X_d_t[i][j]; 
                        } 
                    } 
                } 
            } 
        } 
        sumGamma1 = 0; 
        for(i=0; i < N; i++) 
        { 
            sumGamma1 = sumGamma1 + gamma1[i]; 
            ei[i] = Eij[i][0]; 
            for(j=1; j < N; j++) 
            { 
                ei[i] = ei[i] + Eij[i][j]; 
            } 
            ej[i] = Eia[i][0]; 
            for(j=1; j < M; j++) 
            { 
                ej[i] = ej[i] + Eia[i][j]; 
            } 
        } 
 
        temp = 1.0 / (double)N; 
        temp2= temp/ (double)N; 
        tempM= 1.0 / (double)M; 
        for(i=0; i < N; i++) 
        { 
            // estimate initial distribution 
            if(sumGamma1 < threshold2) 
                _pi[i] = temp; /* uniform distribution */ 
            else 
                _pi[i] = gamma1[i] / sumGamma1; 
            // estimate the transition matrix 
            for(j=0; j < N; j++) 
            { 
                if(ei[i] < threshold2) 
                { 
                    if(i == j) 
                        _transitions[i][j] = 1 - temp + temp2; 
                    else 
                    _transitions[i][j] = temp2; 
                } 
                else 
                { 
                    _transitions[i][j] = Eij[i][j] / ei[i]; 
                } 
            } 
            // estimate the emission matrix 
            for(j=0; j < M; j++) 
            { 
                if(ej[i] < threshold2) 
                    _emissions[i][j] = tempM; 
                else 
                    _emissions[i][j] = Eia[i][j] / ej[i]; 
            } 
        } 
 
        /* assign the estimated new values to the HMM model */ 
        for(i=0; i < N; i++) 
        { 
            /* do not need to update the initial state distribution */ 
            /*pi[i] = _pi[i];*/ 
 
            /* transition matrix */ 
            for(j=0; j < N; j++) 
            { 
                transitions[i][j] = _transitions[i][j]; 
            } 
 
            /* emission matrix */ 
            for(j=0; j < M; j++) 
            { 
                emissions[i][j] = _emissions[i][j]; 
            } 
        } 
        /* compute the error */ 
        Likelihood = forward( trainData[0] ); 
        for(k=1; k < iTrain; k++) 
        { 
            Likelihood = Likelihood + forward(trainData[k]); 
        } 
        error = fabs(Likelihood - oldLikelihood); 
        printf("oldLikelihood = %e, Likelihood = %e, error = %e\n", oldLikelihood, Likelihood, error); 
        oldLikelihood = Likelihood; 
    } 
} 
 
/************************************************************************ 
NAME 
     loadHMM - load hidden Markov model from the specified file 
 
DESCRIPTION 
     This function ... 
 
     Input: 
	   the specified HMM file name 
	 Output: 
	   N, M, transitions, emissions, pi 
	 Global variables list: 
	   M, N, pi, transitions, emissions. 
*************************************************************************/ 
void loadHMM(char* hmmFile) 
{ 
    FILE *in_fp; 
    char* token; 
    int i, j, m; 
    int end; 
    int nLines = cal_lines(hmmFile); 
    int nBuffer = getLengthOfLongestLine(hmmFile)+extraSpace; 
    char *line, *buffer; 
    /* Open for read (will fail if inputfile does not exist) */ 
    if( (in_fp  = fopen( hmmFile, "r" )) == NULL ) 
    { 
        printf( "The file '%s' was not opened\n", hmmFile); 
        exit(1); 
    } 
    line = (char *) malloc(nBuffer * sizeof(char)); 
    buffer = (char *) malloc(nBuffer * sizeof(char)); 
 
    // Part 1 
    // read the first line 
    fgets(line, nBuffer, in_fp); 
 
    // number of states 
    token = strtok( line, seps ); 
    N = atoi(token); 
     
    /* exceptions for number of states */ 
    if(N < 1) 
    { 
        printf("the number of states is too small!\n"); 
        exit(1); 
    } 
    if(N > 100) 
    { 
        printf("the program can't deal with states more than 100!\n"); 
        exit(1); 
    } 
    /* number of observations */ 
    token = strtok( NULL, seps ); 
    M = atoi(token); 
    observationsDefined = TRUE; 
    if(M == noDefObservations) 
    { 
        observationsDefined = FALSE; 
    } 
    /* exceptions for number of observations */ 
    if((M <= 0 && M != noDefObservations) || M > 100) 
    { 
        printf("the number of the observations is not correct!\n"); 
        exit(1); 
    } 
    /* the possible length of the longest line */ 
    token = strtok( NULL, seps ); 
    T = atoi(token); 
 
    // skip one line 
    fgets(line, nBuffer, in_fp); 
 
    // Part 2 
    fgets(line, nBuffer, in_fp); 
    /* whether to read the observations */ 
    if(observationsDefined == TRUE) 
    { 
        strcpy(buffer, line); 
        m = 0; 
        token = strtok( buffer, seps ); 
        // count how many observations there are in the line 
        while(token != NULL) 
        { 
            token = strtok( NULL, seps ); 
            m++; 
        } 
        m = m - 1; // because there is a prefix in the sequence 
 
        /* check whether the nunber of the observations is consistent */ 
        /* if not, exit the program */ 
        if(M != m) 
        { 
            printf("the number of observations is inconsistent!\n"); 
            exit(1); 
        } 
 
        observations = (char*)malloc((M+1)*sizeof(char)); 
        observations[M] = '\0'; 
        token = strtok( line, seps ); 
        token = strtok(NULL, seps); 
        end=0; 
        // read the observations and sort them 
        // insertion sort 
        while(token != NULL) 
        { 
            int pos = end; 
            while(pos > 0 && observations[pos-1] > token[0]) 
            { 
                observations[pos] = observations[pos-1]; 
                pos--; 
            } 
            observations[pos] = token[0]; 
            end++; 
            token = strtok(NULL, seps); 
        } 
    } 
    fgets(line, nBuffer, in_fp); // skip one line 
 
    // Part 3 
    if(AllocateDataSpace( &transitions, N, N ) != correctAction) 
    { 
        printf("allocate memory for transitions error!\n"); 
        exit(1); 
    } 
    for(i=0; i < N; i++) 
    { 
        for(j=0; j < N; j++) 
        { 
            transitions[i][j] = 0; 
        } 
    } 
    // read transition matrix 
    for(i=0; i < N; i++) 
    { 
        fgets(line, nBuffer, in_fp); 
        token = strtok( line, seps ); 
        j=0; 
        while(token != NULL) 
        { 
            transitions[i][j] = atof(token); 
            j++; 
            token = strtok(NULL, seps); 
        } 
    } 
    fgets(line, nBuffer, in_fp); // skip one line 
 
    // Part 4 
    // read emission matrix 
    if(observationsDefined == TRUE) 
    { 
        if(AllocateDataSpace( &emissions, N, M ) != correctAction) 
        { 
            printf("allocate memory for emissions error!\n"); 
            exit(1); 
        } 
        // initializing the emission matrix 
        for(i=0; i < N; i++) 
        { 
            for(j=0; j < M; j++) 
            { 
                emissions[i][j] = 0; 
            } 
        } 
        // read emission matrix 
        for(i=0; i < N; i++) 
        { 
            fgets(line, nBuffer, in_fp); 
            token = strtok( line, seps ); 
            j=0; 
            while(token != NULL) 
            { 
                emissions[i][j] = atof(token); 
                j++; 
                token = strtok(NULL, seps); 
            } 
        } 
    } 
    fgets(line, nBuffer, in_fp); // skip one line 
 
    // Part 5 
    pi = (double*)malloc(N*sizeof(double)); 
    for(i=0; i < N; i++) 
    { 
        pi[i] = 0; // initial value 
    } 
    // read initial distributions 
    fgets(line, nBuffer, in_fp); 
    token = strtok( line, seps ); 
    i=0; 
    while(token != NULL) 
    { 
        pi[i] = atof(token); 
        i++; 
        token = strtok(NULL, seps); 
    } 
 
    fclose(in_fp); 
    free(line); 
    free(buffer); 
} 
 
/************************************************************************ 
NAME 
     loadSeq - load sequences from the specified file 
 
DESCRIPTION 
     This function ... 
 
     Input: 
	   the specified sequence file name 
	 Output: 
	   an string array for the input file 
	 Global variables list: 
	   trainData or testData. 
*************************************************************************/ 
void loadSeq(char***pSeq, char* seqFile) 
{ 
    FILE *in_fp; 
    int i; 
    char * token; 
    int nLines = cal_lines(seqFile); 
    int nBuffer = getLengthOfLongestLine(seqFile)+extraSpace; 
    char *line; 
    char **seq; 
    line = (char *) malloc(nBuffer * sizeof(char)); 
 
    if(AllocateDataSpaceChar(&seq, nLines, nBuffer) != correctAction) 
    { 
        printf("allocate memory error\n"); 
        exit(1); 
    } 
    /* Open for read (will fail if inputfile does not exist) */ 
    if( (in_fp  = fopen( seqFile, "r" )) == NULL ) 
    { 
        printf( "The file '%s' was not opened\n", seqFile); 
        exit(1); 
    } 
 
    for(i=0; i < nLines; i++) 
    { 
        fgets(seq[i], nBuffer, in_fp); 
        // deletet the invalid characters 
        token = strtok(seq[i], seps); 
        /* if there are errors in the input sequence */ 
        if(strlen(seq[i]) <= 0) 
        { 
            printf("the length of the sequence is less than 1 in line %d of file %s!\n", i, seqFile); 
            exit(1); 
        } 
    } 
    fclose(in_fp); 
 
    free(line); 
 
    (*pSeq) = seq; 
} 
 
/************************************************************************ 
NAME 
     AllocateDataSpaceChar - allocate memory with specified space 
 
DESCRIPTION 
     This function ... 
 
     Input: 
	   row and col of the space, the pointer 
	 Output: 
	   correctAction or not 
	 Global variables list: 
	   None. 
*************************************************************************/ 
int AllocateDataSpaceChar(char ***pData, int row, int col) 
{ 
    int i; 
    char** Data; 
    Data = (char**)malloc(sizeof(char*) * row); 
    SUCCESS( Data ); 
    for(i=0; i < row; i++) 
    { 
        Data[ i ] = (char *)malloc(sizeof(char) * col); 
        SUCCESS( Data[ i ] ); 
    } 
 
    (*pData) = Data; 
    return correctAction; 
} 
 
/************************************************************************ 
NAME 
     readObservations - read observations from the specified data 
 
DESCRIPTION 
     This function ... 
 
     Input: 
	   specifed data 
	 Output: 
	   observation list 
	 Global variables list: 
	   M, iTrain, trainData. 
*************************************************************************/ 
void readObservations(char** trainData) 
{ 
    int nBuffer = strlen(trainData[0]); // initial size of the buffer 
    char* buffer; 
    char* buffer2; 
    int nObserv = 0; 
    int length; 
    int i, j, k, ii; 
    // allocate memory for the buffer 
    buffer = (char*) malloc((nBuffer+1)*sizeof(char)); 
 
    // initialize that there is one observation in the observation array 
    buffer[0] = trainData[0][0]; 
    nObserv = 1; 
    for(i=0; i < iTrain; i++) 
    { 
        length = strlen(trainData[i]); 
        for(j=0; j < length; j++) 
        { 
            // find the position of the current observation in the observation array 
            for(k=0; k < nObserv; k++) 
            { 
                if(trainData[i][j] <= buffer[k]) 
                break; 
            } 
            // if it is a new observation 
            // insert it into the observation array with insertion sort 
            if(k == nObserv || trainData[i][j] < buffer[k]) 
            { 
                nObserv++; 
                // adjust the buffer's size 
                if(nObserv == nBuffer) 
                { 
                    nBuffer = nBuffer + nBuffer; 
                    buffer2 = (char*) malloc((nBuffer+1)*sizeof(char)); 
                    for(ii=0; ii < nObserv; ii++) 
                    {            buffer2[ii] = buffer[ii]; 
                    } 
                    free(buffer); 
                    buffer = buffer2; 
                } 
 
                // insert the new observation into the array 
                if(trainData[i][j] < buffer[k]) 
                { 
                    for(ii=nObserv-2; ii >= k; ii--) 
                    { 
                        buffer[ii+1] = buffer[ii]; 
                    } 
                } 
                buffer[k] = trainData[i][j]; 
            } 
        } 
    } 
 
    // assign the known observations to the observation array 
    M = nObserv; 
    observations = (char*)malloc(M*sizeof(char)); 
    for(i=0; i < M; i++) 
        observations[i] = buffer[i]; 
 
    free(buffer); 
} 
 
/************************************************************************ 
NAME 
     initEmissions - initialize the emission matrix 
 
DESCRIPTION 
     If the emission matrix is read from file, it will be normalized. 
	 If the emission matrix is initlized at the first time, the emission 
	 matrix will be initialized as follows. 
	 START state will emit '$' with probability 1 and other observations 
	 with probabilities with 0. END state will emit '#' with probabilities 
	 with probability 1 and other observations with probabilities with 0. 
	 Other states will emit '$' and '#' with probabilities 0, and other 
	 observations with uniformly distribution. 
 
     Input: 
	   None 
	 Output: 
	   emission matrix 
	 Global variables list: 
	   N, M, emissions, extraObservations. 
*************************************************************************/ 
void initEmissions() 
{ 
    double temp; 
    int i, j; 
    int iObservationStart, iObservationEnd; 
    if(observationsDefined == FALSE) 
    { 
        if(AllocateDataSpace( &emissions, N, M ) != correctAction) 
        { 
            printf("allocate memory for emissions error!\n"); 
            exit(1); 
        } 
        /* START state and END state */ 
        // emit start observation with probability 1 in START state 
        // emit ens   observation with probability 1 in END   state 
        for(j=0; j < M; j++) 
        { 
            emissions[0][j]  = 0; 
            emissions[N-1][j] = 0; 
        } 
        iObservationStart = getObservation('$'); 
        emissions[0][iObservationStart] = 1; 
        iObservationEnd = getObservation('#'); 
        emissions[N-1][iObservationEnd] = 1; 
 
        /* other states */ 
        temp = 1.0 / (double)(M - extraObservations); 
        for(i=1; i < N-1; i++) 
        { 
            /* emission matrix */ 
            for(j=0; j < M; j++) 
            { 
                emissions[i][j] = temp; 
            } 
            // other states will not emit the start and end observations 
            emissions[i][iObservationStart] = 0; 
            emissions[i][iObservationEnd] = 0; 
        } 
        //  observationsDefined = TRUE; 
    } 
    else 
    { 
        for(i=0; i < N; i++) 
        { 
            double sum = 0; 
            /* emission probabilities in one state */ 
            for(j=0; j < M; j++) 
            { 
                sum = sum + emissions[i][j]; 
            } 
            // if all of the emission probabilities in one state is very small 
            if(sum < 0.1) 
            { 
                printf("error in the emission matrix!\n"); 
                exit(1); 
            } 
			/* normalize the emission probabilities */ 
            for(j=0; j < M; j++) 
            { 
                emissions[i][j] = emissions[i][j] / sum; 
            } 
        } 
    } 
} 
 
/************************************************************************ 
NAME 
     initTransitions - initialize the transition matrix 
 
DESCRIPTION 
     The function normalizes the transition matrix. 
 
     Input: 
	   None 
	 Output: 
	   transition matrix 
	 Global variables list: 
	   N, transitions. 
*************************************************************************/ 
void initTransitions() 
{ 
    int i, j; 
    for(i=0; i < N; i++) 
    { 
        double sum = 0; // used for normalizing the transition matrix 
        /* transition probabilities from one state */ 
        for(j=0; j < N; j++) 
        { 
            sum = sum + transitions[i][j]; 
        } 
        // if all of the transition probabilities left one state is very small 
        if(sum < 0.1) 
        { 
            printf("error in the transition matrix!\n"); 
            exit(1); 
        } 
		/* normalization */ 
        for(j=0; j < N; j++) 
        { 
            transitions[i][j] = transitions[i][j] / sum; 
        } 
    } 
} 
 
/************************************************************************ 
NAME 
     initPi - initialize the initial state distribution 
 
DESCRIPTION 
     It is assumed that the model is in the first state with probability 1 
 
     Input: 
	   None 
	 Output: 
	   pi 
	 Global variables list: 
	   N, pi. 
*************************************************************************/ 
void initPi() 
{ 
    int i; 
    pi[0] = 1; 
    for(i=1; i < N; i++) 
    { 
        pi[i] = 0; 
    } 
} 
 
/************************************************************************ 
NAME 
     init - initialize the hidden Markov model and the space 
 
DESCRIPTION 
 
     Input: 
	   None 
	 Output: 
	   None 
	 Global variables list: 
	   N, T, alpha, beta. 
*************************************************************************/ 
void init() 
{ 
    initTransitions(); 
    initEmissions(); 
    initPi(); 
    AllocateDataSpace(&alpha, T, N); 
    AllocateDataSpace(&beta, T, N); 
} 
 
/************************************************************************ 
NAME 
     check_mode - check whether the files are provided for the specified 
	 mode. 
 
DESCRIPTION 
     This function ... Different files must be provided for different 
	 mode. 
 
     TRAINING: training file must be provided 
	 TESTING:  testing file must be provided 
	 TRAINING_TESTING: both training file and testing file must be provided 
 
     Input: 
	   trainFile and testFile pointers 
	 Output: 
	   None 
	 Global variables list: 
	   None. 
*************************************************************************/ 
void check_mode(int mode, char* trainFile, char* testFile) 
{ 
    switch(mode) 
    { 
    case TRAINING: 
        if(trainFile == NULL) 
        { 
            printf("In TRAINING mode, the training file was not provided!\n"); 
            exit(1); 
        } 
        break; 
    case TESTING: 
        if(testFile == NULL) 
        { 
            printf("In TESTING mode, the testing file was not provided!\n"); 
            exit(1); 
        } 
        break; 
    case TRAINING_TESTING: 
        if(trainFile == NULL || testFile == NULL) 
        { 
            printf("Training file and/or testing file was not provided\n"); 
            exit(1); 
        } 
        break; 
    default: 
        printf("The mode is invalid! Please read the usage message!\n\n"); 
        printf("%s", usage); 
        exit(1); 
        break; 
    } 
}