www.pudn.com > myHMMsource.rar > myHMM.c, change:2004-02-26,size:44333b

```/************************************************************************
*                                                                      *
*  Program packages 'myHMM' :                                          *
*                                                                      *
*  myHMM.c                                                             *
*   -the main program for myHMM hidden Markov model                    *
*                                                                      *
*  Version 0.1                                                         *
*  Date: 3 May 2003                                                    *
*                                                                      *
*  NOTE: This program package is copyrighted in the sense that it      *
*  may be used for scientific purposes. The package as a whole, or     *
*  parts thereof, cannot be included or used in any commercial         *
*  application without written permission granted by its producents.   *
*  No programs contained in this package may be copied for commercial  *
*  distribution.                                                       *
*                                                                      *
*  All comments concerning this program package may be sent to the     *
*                                                                      *
************************************************************************/

/*************************************************************************
NAME
myHMM - train the hidden Markov model

SYNOPSIS
myHMM -hmm [HMM] [-train trainFile] [-trainedHmm trainedHmmFile]
[-test testFile] [-tested testedFile] [-mode mode]

DESCRIPTION

Input:
required: pre-defined hidden Markov model
optional: training data, testing data

Output:
optional: trained hidden Markov model, tested result

Global variables:
N: integer, number of states
M: integer, number of observations
T: integer, the length of the longest input sequence
S: integer, finite set of possible states
O: integer, finite set of possible observations
t: double, transition matrix: N x N
e: integer, emission matrix: N x M
pi: integer, initial state distribution: 1 x N

%% A hidden Markov model (HMM) is a five-tuple (S,O,t,e,pi).
%% Let lambda = {t,e,pi} denote the parameters for a given HMM
%% with fixed S and O.

%% There are three basic problems in HMM:
%% 1) probability of the observation sequence given the HMM model
%%   -> forward algorithm & backward algorithm
%% 2) the most probable state path of the observation sequence
%%   given the HMM model
%%   -> Viterbi algorithm
%% 3) Building the model given a training set of sequence
%%   -> Baum-Welch algorithm

OPTIONS
-hmm [HMM]
pre-defined hidden Markov Model

-train trainFile
training file

-trainedHmmFile trainedHmmFile
trained hidden Markov model

-test testFile
testing file

-tested testedFile
tested result file

-mode mode
mode must be one of 1, 2 or 3. '1' represents TRAINING, '2'
represents TESTING, and '3' represents TRAINING_TESTING.
*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <math.h>
#include "tools.h"
#include "myhmm.h"
#include "arg.h"

static char usage[] =
"myHMM - hidden Markov model\n"
"command: myHMM -hmm <hmm> [option]\n"
"Required parameters:\n"
"  -hmm filename        the file for hmm definition\n"
"Optional parameters:\n"
"  -train filename      the file for training\n"
"  -test filename       the file for testing\n"
"  -trainedHmm filename the file for trained hmm\n"
"  -tested filename     the file for tested result\n"
"  -mode int            the mode for training, testing or both\n"
"                       1: training, 2: testing, 3: both\n";

/************************************************************************
NAME
main - main function in the program

DESCRIPTION
This function extracts parameters from the argument list and perform
the specified actions by the parameters.

Input:
arguments in the command line
Output:
specified output(s): trained HMM file and/or tesed result file
Global variable list:
iTrain, iTest, T, trainData, testData, observationDefined,
extraSpace.
*************************************************************************/
int main(int argc, char** argv)
{
char* hmmFile;
char* trainFile;
char* testFile;
char* trainedHmmFile;
char* testedFile;
int mode;

/*---------- get the parameters ----------*/
if (argc <= 1 || extract_parameter(argc, argv, "-help", OPTION2) || extract_parameter(argc, argv, "-h", OPTION2) )
{
printf("%s", usage);

exit(0);
}
hmmFile   = extract_parameter(argc, argv, HMM_FILE, ALWAYS);
trainFile = extract_parameter(argc, argv, TRAINING_FILE, OPTION);
testFile  = extract_parameter(argc, argv, TESTING_FILE, OPTION);
trainedHmmFile = extract_parameter(argc, argv, TRAINED_HMM_FILE, OPTION);
testedFile    = extract_parameter(argc, argv, TESTED_FILE, OPTION);
mode = oatoi(extract_parameter(argc, argv, MODE, OPTION), TRAINING);

/* check whether the mode is consistant with the provided files */
check_mode(mode, trainFile, testFile);

/* default values for trainedHmmFile and testedFile */
if(trainedHmmFile == NULL)
trainedHmmFile = defTrainedHmmFile;
if(testedFile == NULL)
testedFile = defTestedFile;

/*---------- train the HMM model ----------*/
if(mode == TRAINING || mode == TRAINING_TESTING)
{
iTrain = cal_lines(trainFile);
T = getLengthOfLongestLine(trainFile) + extraSpace;
if(observationsDefined == FALSE)
{
}
/*---------- initialization ----------*/
init();
/*---------- train HMM ----------*/
baumWelch();
/*---------- save trained HMM ----------*/
saveHmm(trainedHmmFile);
}

/*---------- test with the HMM model ----------*/
if(mode == TESTING || mode == TRAINING_TESTING)
{
/*---------- 5. load testing data ----------*/
if(mode == TESTING)
{
T = getLengthOfLongestLine(testFile) + extraSpace;
init();
}
iTest = cal_lines(testFile);
/*---------- test ----------*/
test(testedFile);
}
/* post-processing */
post_processing();

return 0;
}

/************************************************************************
NAME
post_processing - post processing after the main process

DESCRIPTION
This function releases the allocated memery for the global variables.

Input:
pointers to global variables.
Output:
None.
Global variable list:
transitions, emissions, pi, observations, alpha, beta, trainData,
testData.
*************************************************************************/
void post_processing()
{
int i;

/* free space for transition matrix */
if(transitions != NULL)
{
for(i=0; i < N; i++)
{
if(transitions[i] != NULL) free(transitions[i]);
}
free(transitions);
}
/* free space for emission matrix */
if(emissions != NULL)
{
for(i=0; i < N; i++)
{
if(emissions[i] != NULL) free(emissions[i]);
}
free(emissions);
}
/* free space for initial distribution vector */
if(pi != NULL) free(pi);
/* free space for observation list */
if(observations != NULL) free(observations);
/* free space for matrix alpha, used in forward algorithm */
if(alpha != NULL)
{
for(i=0; i < T; i++)
{
if(alpha[i] != NULL) free(alpha[i]);
}
free(alpha);
}
/* free space for matrix beta, used in backward algorithm */
if(beta != NULL)
{
for(i=0; i < T; i++)
{
if(beta[i] != NULL) free(beta[i]);
}
free(beta);
}
/* free space for training data */
if(trainData != NULL)
{
for(i=0; i < iTrain; i++)
{
if(trainData[i] != NULL) free(trainData[i]);
}
free(trainData);
}
/* free space for testing data */
if(testData != NULL)
{
for(i=0; i < iTest; i++)
{
if(testData[i] != NULL) free(testData[i]);
}
free(testData);
}
}

/************************************************************************
NAME
saveHmm - save the trained hidden Markov model in pre-defined format

DESCRIPTION
This function saves the trained hidden Markov model in pre-defined
format.

Input:
trained HMM & output file name
Output:
output file
Global variables:
N, M, T, transitions, emissions, pi.
*************************************************************************/
void saveHmm(char *out)
{
FILE *fp;
int i, j;

/* Open for write (will fail if inputfile does not exist) */
if( (fp  = fopen( out, "w" )) == NULL )
{
printf( "The file '%s' was not opened\n", out);
exit(1);
}
fprintf(fp, "%d %d %d\n", N, M, T);
fprintf(fp, "// Observation list\n");
fprintf(fp, "O: ");
for(i=0; i < M; i++)
{
fprintf(fp, "%c ", observations[i]);
}
fprintf(fp, "\n");
/* transition matrix */
fprintf(fp, "// Transition matrix\n");
for(i=0; i < N; i++)
{
for(j=0; j < N; j++)
{
fprintf(fp, "%6.4f ", transitions[i][j]);
}
fputc('\n', fp);
}
/* emission matrix */
fprintf(fp, "// Emission matrix\n");
// emission matrix
for(i=0; i < N; i++)
{
for(j=0; j < M; j++)
{
fprintf(fp, "%6.4f ", emissions[i][j]);
}
fputc('\n', fp);
}
/* initial state distribution */
fprintf(fp, "// Initial state distribution\n");
for(i=0; i < N; i++)
{
fprintf(fp, "%6.4f ", pi[i]);
}
fputc('\n', fp);

fclose( fp );
}

/************************************************************************
NAME
test - test the test file with given hidden Markov model

DESCRIPTION
This function tests the test file with given hidden Markov model.

Input:
trained HMM & tested file name
Output:
tested file
Global variables list:
T, iTest, testData, extraObservations.
*************************************************************************/
void test(char* testedFile)
{
FILE *out_fp;
int i;
double prob;
/* the most probable state path of the current observation sequence */
char *buffer;
/* the most probable state paths of all the observation sequences */
double score;

buffer = (char *) malloc(T * sizeof(char));

/* Open for write (will fail if inputfile does not exist) */
if( (out_fp  = fopen( testedFile, "w" )) == NULL )
{
printf( "The file '%s' was not opened\n", testedFile);
exit(1);
}

for(i=0; i < iTest; i++)
{
prob = forward( testData[i] );
/* score is equal to the natural logarithm of the probability
divided by the length */
score = log(prob) / (double)(strlen(testData[i]) - extraObservations);
viterbi(testData[i], buffer);
/* output probability and the most probable path */
fprintf(out_fp, "%f\n%s\n%s\n", score, testData[i], buffer);
}

free( buffer );

fclose( out_fp );
}

/************************************************************************
NAME
getSymbol - get the symbol corresponding to each state

DESCRIPTION
This function .. There are only 52 distinct symbols in this function.

Input:
state number
Output:
corresponding symbol.
Global variables list:
None.
*************************************************************************/
char getSymbol(int k)
{
char result;

if(k >= 0 && k < 26)
result = 'a' + k;
else if(k >= 26 && k < 52)
result = 'A' + k - 26;
else
result = '-';

return result;
}

/************************************************************************
NAME
getObservation - get the order of the specified observation

DESCRIPTION
This function gets the order of the specified observation in the
observation list with binary search

Input:
specified observation
Output:
the order of the specified observation
Global variables list:
M.
*************************************************************************/
int getObservation(char ch)
{
int start, end, middle;
/* initial value */
start = 0;
end = M-1;
middle= -1;
//exception: the input observation is not in the observation list
if(ch < observations[start] || ch > observations[end])
{
printf("the input observation is not in the observation list!\n");
exit(1);
}

if(observations[start] == ch)
return start;
if(observations[end] == ch)
return end;
middle = (start + end)/2;

/* why middle != start:
if middle == start, it means end = start + 1, and 'ch' is a
character which lies between observation 'start' and
observation 'end' */
while(observations[middle]!=ch && start < end && middle != start)
{
if(observations[middle] < ch)
start = middle;
else
end = middle;

middle = (start + end) / 2;
}

if(observations[middle] != ch)
{
printf("The char is not in observation list!\n");
exit(1);
}

return middle;
}

/************************************************************************
NAME
forward - forward algorithm

DESCRIPTION
This function calculates the probability of the observation sequence
with forward algorithm given the HMM model.

Input:
HMM model & the observation sequence
Output:
Probability
Global variables list:
N, alpha, pi, transitions, emissions.
*************************************************************************/
double forward(char *line)
{
int i, j, t;
int length = strlen( line );
/* the order of the real observation in the observation set */
int iObservation;
/* transition probability to state i given the observation sequence */
double sumTransProb;
double prob;

/* basic condition in forward algorithm */
t=0;
iObservation = getObservation( line[t] );
for(i=0; i < N; i++)
{
alpha[t][i] = pi[i]*emissions[i][ iObservation ];
}

/* the induction step in forward algorithm */
for(t=1; t < length; t++)
{
iObservation = getObservation( line[t] );
for(i=0; i < N; i++)
{
sumTransProb = 0;
for(j=0; j < N; j++)
{
sumTransProb = sumTransProb + alpha[t-1][j] * transitions[j][i];
}

alpha[t][i] = sumTransProb * emissions[i][iObservation];
}
}

/* the termination step */
t = length-1;
prob = 0;
for(i=0; i < N; i++)
{
prob = prob + alpha[t][i];
}

return prob;
}

/************************************************************************
NAME
backward - backward algorithm

DESCRIPTION
This function calculates the probability of the observation sequence
with backward algorithm given the HMM model.

Input:
HMM model & the observation sequence
Output:
Probability
Global variables list:
N, beta, pi, transitions, emissions.
*************************************************************************/
double backward(char *line)
{
int i, j, t;
int length = strlen( line );
/* the order of the real observation in the observation set */
int iObservation;
/* probability to state i given the sequence and the observation */
double sumProbi;
double prob;

/* basic condition in backward algorithm */
t = length-1;
for(i=0; i < N; i++)
{
beta[t][i] = 1;
}

/* the induction step in backward algorithm */
for(t=length-2; t >= 0; t--)
{
iObservation = getObservation( line[t+1] );
for(i=0; i < N; i++)
{
sumProbi = 0;
for(j=0; j < N; j++)
{
sumProbi = sumProbi + transitions[i][j]*emissions[j][iObservation] * beta[t+1][j];
}

beta[t][i] = sumProbi;
}
}

/* the termination step */
t = 0;
iObservation = getObservation( line[t] );
prob = 0;
for(i=0; i < N; i++)
{
prob = prob + pi[i]*emissions[i][iObservation]*beta[t][i];
}

return prob;
}

/************************************************************************
NAME
viterbi - viterbi algorithm

DESCRIPTION
This function calculates the most probable state path of the
observation sequence with viterbi algorithm given the HMM model.

Input:
HMM model & the observation sequence
Output:
the most probable state path
Global variables list:
N, beta, pi, transitions, emissions.
*************************************************************************/
void viterbi(char *line, char *buffer)
{
/* delta_t0[ N ] */
double* delta_t0;
/* delta_t1[ N ] */
double* delta_t1;
/* the matrix to store the optimal states, used for backtracking
size: N * length */
int *phi;
/* position in the matrix phi */
int pos;
int length = strlen( line );
int i, j, t;
int fromState, inState;
int iObservation;
double sum_for_scale;
double maxProb, currentProb;

/* allocate space for buffer */
delta_t0 = (double*)malloc(N*sizeof(double));
delta_t1 = (double*)malloc(N*sizeof(double));
phi = (int*)malloc(length * N * sizeof( int ));

/* the initial step */
iObservation = getObservation( line[0] );
for(i=0; i < N; i++)
{
delta_t0[i] = pi[i] * emissions[i][iObservation];
phi[i] = 0;
}

/* recursion step */
pos = N;
for(t=1; t < length; t++)
{
iObservation = getObservation( line[t] );
sum_for_scale = 0;
for(j=0; j < N; j++)
{
/* find the optimal state from which transfered to state j */
maxProb = delta_t0[0] * transitions[0][j];
fromState = 0;
for(i=1; i < N; i++)
{
currentProb = delta_t0[i] * transitions[i][j];
if(maxProb < currentProb)
{
maxProb = currentProb;
fromState = i;
}
}
/* calculate the transition probability */
delta_t1[j] = emissions[j][iObservation] * maxProb;
sum_for_scale = sum_for_scale + delta_t1[j];
phi[ pos++ ] = fromState;
}

/* update current probabilities */
for(j=0; j < N; j++)
{
if(sum_for_scale < 0.1)
delta_t0[j] = delta_t1[j] * 10;
else
delta_t0[j] = delta_t1[j];
}
}

/* termination */
maxProb = delta_t0[0];
inState = 0;
for(i=1; i < N; i++)
{
currentProb = delta_t0[i];
if(maxProb < currentProb)
{
maxProb = currentProb;
inState = i;
}
}

/* path backtracking */
i = inState;
pos = length * N;
for(t=length-1; t >= 0 ; t--)
{
buffer[ t ] = getSymbol( i );
pos = pos - N;
if(pos+i < 0)
{
printf("Error in the subscribe 1: pos = %d, statei = %d\n", pos, i);
exit(1);
}
if(pos+i >= length * N)
{
printf("Error in the subscribe 2\n");
exit(1);
}
i = phi[pos + i];
}

buffer[length] = '\0';
free(delta_t0);
free(delta_t1);
free( phi );

return;
}

/************************************************************************
NAME
baumWelch - baumWelch algorithm

DESCRIPTION
This function estimates the parameters of HMM with specified topology
given the training data and initial parameters of HMM

Input:
HMM topology, HMM initial parameters, and training data
Output:
the estimated parameters
Global variables list:
iTrain, trainData, T, M, N, alpha, beta, pi, transitions, emissions.
*************************************************************************/
void baumWelch()
{
int loops;
int i, j, k, t;
int iObservation;
int length;
double** _transitions; /* _transitions[ N ][ N ] estimated transition matrix T */
double** _emissions;   /* _emissions[ N ][ M ]   estimated emission matrix E */
double* _pi;           /* _pi[ N ]               estimated initial distribution */
double oldLikelihood, Likelihood;
double error;
double sumGamma1;
double* gamma1;  // gamma1[N]: for estimating initial distribution of the states
double** Eij;  // Eij[N][N]: expected numbers transited from state i to state j
double** Eia;  // Eia[N][M]: expected numbers of emitting a in state i
/* X_d_t[N][N]: expected numbers transited from state i to state j
when it is with observation O in the position t of d-th sequence */
double** X_d_t;
double* ei;    // ei[N]: expected numbers in state i
double* ej;    // ej[N]: expected numbers of emissions in state j
double Prob;
double temp, temp2, tempM;

/* allocate space */
AllocateDataSpace( &_transitions, N, N );
AllocateDataSpace( &_emissions, N, M );
_pi = (double*)malloc(N*sizeof(double));
gamma1 = (double*)malloc(N*sizeof(double));
AllocateDataSpace( &Eij, N, N );
AllocateDataSpace( &Eia, N, M );
AllocateDataSpace( &X_d_t, N, N );
ei = (double*)malloc(N*sizeof(double));
ej = (double*)malloc(N*sizeof(double));

/* initialization */
error = 1;
oldLikelihood = forward( trainData[0] );
Likelihood = backward( trainData[0] );
for(k=1; k < iTrain; k++)
{
oldLikelihood = oldLikelihood + forward(trainData[k]);
Likelihood = Likelihood + backward(trainData[k]);
}
/* check whether forward and backward algorithms work well */
if(fabs((Likelihood-oldLikelihood)/Likelihood) > 0.01)
{
printf("there are errors in forward or backward algorithm\n");
exit(1);
}
Likelihood = oldLikelihood;
printf("Likelihood from forward: %e\n", Likelihood);
loops = 0;
while((error > threshold && loops < 1000) && Likelihood > threshold)
{
printf("loops: %d\n", loops);
loops++;
/* initialize the expected matrix */
for(i=0; i < N; i++)
{
gamma1[i] = 0;
for(j=0; j < N; j++)
{
Eij[i][j] = 0;
}
for(j=0; j < M; j++)
{
Eia[i][j] = 0;
}
}

// loop for all the sequences
for(k=0; k < iTrain; k++)
{
// initialize the matrix alpha and beta
Prob = backward(trainData[k]);
Prob = forward(trainData[k]);
length = strlen(trainData[k]);
// loop for one sequence
for(t=0; t < length-1; t++)
{
iObservation = getObservation(trainData[k][t+1]);
for(i=0; i < N; i++)
{
for(j=0; j < N; j++)
{
X_d_t[i][j] = (alpha[t][i] * transitions[i][j] * emissions[j][iObservation] * beta[t+1][j]) / Prob;
Eij[i][j] = Eij[i][j] + X_d_t[i][j];
Eia[j][iObservation] = Eia[j][iObservation] + X_d_t[i][j];
if(t == 0)
{
gamma1[i] = gamma1[i] + X_d_t[i][j];
}
}
}
}
}
sumGamma1 = 0;
for(i=0; i < N; i++)
{
sumGamma1 = sumGamma1 + gamma1[i];
ei[i] = Eij[i][0];
for(j=1; j < N; j++)
{
ei[i] = ei[i] + Eij[i][j];
}
ej[i] = Eia[i][0];
for(j=1; j < M; j++)
{
ej[i] = ej[i] + Eia[i][j];
}
}

temp = 1.0 / (double)N;
temp2= temp/ (double)N;
tempM= 1.0 / (double)M;
for(i=0; i < N; i++)
{
// estimate initial distribution
if(sumGamma1 < threshold2)
_pi[i] = temp; /* uniform distribution */
else
_pi[i] = gamma1[i] / sumGamma1;
// estimate the transition matrix
for(j=0; j < N; j++)
{
if(ei[i] < threshold2)
{
if(i == j)
_transitions[i][j] = 1 - temp + temp2;
else
_transitions[i][j] = temp2;
}
else
{
_transitions[i][j] = Eij[i][j] / ei[i];
}
}
// estimate the emission matrix
for(j=0; j < M; j++)
{
if(ej[i] < threshold2)
_emissions[i][j] = tempM;
else
_emissions[i][j] = Eia[i][j] / ej[i];
}
}

/* assign the estimated new values to the HMM model */
for(i=0; i < N; i++)
{
/* do not need to update the initial state distribution */
/*pi[i] = _pi[i];*/

/* transition matrix */
for(j=0; j < N; j++)
{
transitions[i][j] = _transitions[i][j];
}

/* emission matrix */
for(j=0; j < M; j++)
{
emissions[i][j] = _emissions[i][j];
}
}
/* compute the error */
Likelihood = forward( trainData[0] );
for(k=1; k < iTrain; k++)
{
Likelihood = Likelihood + forward(trainData[k]);
}
error = fabs(Likelihood - oldLikelihood);
printf("oldLikelihood = %e, Likelihood = %e, error = %e\n", oldLikelihood, Likelihood, error);
oldLikelihood = Likelihood;
}
}

/************************************************************************
NAME

DESCRIPTION
This function ...

Input:
the specified HMM file name
Output:
N, M, transitions, emissions, pi
Global variables list:
M, N, pi, transitions, emissions.
*************************************************************************/
{
FILE *in_fp;
char* token;
int i, j, m;
int end;
int nLines = cal_lines(hmmFile);
int nBuffer = getLengthOfLongestLine(hmmFile)+extraSpace;
char *line, *buffer;
/* Open for read (will fail if inputfile does not exist) */
if( (in_fp  = fopen( hmmFile, "r" )) == NULL )
{
printf( "The file '%s' was not opened\n", hmmFile);
exit(1);
}
line = (char *) malloc(nBuffer * sizeof(char));
buffer = (char *) malloc(nBuffer * sizeof(char));

// Part 1
fgets(line, nBuffer, in_fp);

// number of states
token = strtok( line, seps );
N = atoi(token);

/* exceptions for number of states */
if(N < 1)
{
printf("the number of states is too small!\n");
exit(1);
}
if(N > 100)
{
printf("the program can't deal with states more than 100!\n");
exit(1);
}
/* number of observations */
token = strtok( NULL, seps );
M = atoi(token);
observationsDefined = TRUE;
if(M == noDefObservations)
{
observationsDefined = FALSE;
}
/* exceptions for number of observations */
if((M <= 0 && M != noDefObservations) || M > 100)
{
printf("the number of the observations is not correct!\n");
exit(1);
}
/* the possible length of the longest line */
token = strtok( NULL, seps );
T = atoi(token);

// skip one line
fgets(line, nBuffer, in_fp);

// Part 2
fgets(line, nBuffer, in_fp);
/* whether to read the observations */
if(observationsDefined == TRUE)
{
strcpy(buffer, line);
m = 0;
token = strtok( buffer, seps );
// count how many observations there are in the line
while(token != NULL)
{
token = strtok( NULL, seps );
m++;
}
m = m - 1; // because there is a prefix in the sequence

/* check whether the nunber of the observations is consistent */
/* if not, exit the program */
if(M != m)
{
printf("the number of observations is inconsistent!\n");
exit(1);
}

observations = (char*)malloc((M+1)*sizeof(char));
observations[M] = '\0';
token = strtok( line, seps );
token = strtok(NULL, seps);
end=0;
// read the observations and sort them
// insertion sort
while(token != NULL)
{
int pos = end;
while(pos > 0 && observations[pos-1] > token[0])
{
observations[pos] = observations[pos-1];
pos--;
}
observations[pos] = token[0];
end++;
token = strtok(NULL, seps);
}
}
fgets(line, nBuffer, in_fp); // skip one line

// Part 3
if(AllocateDataSpace( &transitions, N, N ) != correctAction)
{
printf("allocate memory for transitions error!\n");
exit(1);
}
for(i=0; i < N; i++)
{
for(j=0; j < N; j++)
{
transitions[i][j] = 0;
}
}
for(i=0; i < N; i++)
{
fgets(line, nBuffer, in_fp);
token = strtok( line, seps );
j=0;
while(token != NULL)
{
transitions[i][j] = atof(token);
j++;
token = strtok(NULL, seps);
}
}
fgets(line, nBuffer, in_fp); // skip one line

// Part 4
if(observationsDefined == TRUE)
{
if(AllocateDataSpace( &emissions, N, M ) != correctAction)
{
printf("allocate memory for emissions error!\n");
exit(1);
}
// initializing the emission matrix
for(i=0; i < N; i++)
{
for(j=0; j < M; j++)
{
emissions[i][j] = 0;
}
}
for(i=0; i < N; i++)
{
fgets(line, nBuffer, in_fp);
token = strtok( line, seps );
j=0;
while(token != NULL)
{
emissions[i][j] = atof(token);
j++;
token = strtok(NULL, seps);
}
}
}
fgets(line, nBuffer, in_fp); // skip one line

// Part 5
pi = (double*)malloc(N*sizeof(double));
for(i=0; i < N; i++)
{
pi[i] = 0; // initial value
}
fgets(line, nBuffer, in_fp);
token = strtok( line, seps );
i=0;
while(token != NULL)
{
pi[i] = atof(token);
i++;
token = strtok(NULL, seps);
}

fclose(in_fp);
free(line);
free(buffer);
}

/************************************************************************
NAME

DESCRIPTION
This function ...

Input:
the specified sequence file name
Output:
an string array for the input file
Global variables list:
trainData or testData.
*************************************************************************/
{
FILE *in_fp;
int i;
char * token;
int nLines = cal_lines(seqFile);
int nBuffer = getLengthOfLongestLine(seqFile)+extraSpace;
char *line;
char **seq;
line = (char *) malloc(nBuffer * sizeof(char));

if(AllocateDataSpaceChar(&seq, nLines, nBuffer) != correctAction)
{
printf("allocate memory error\n");
exit(1);
}
/* Open for read (will fail if inputfile does not exist) */
if( (in_fp  = fopen( seqFile, "r" )) == NULL )
{
printf( "The file '%s' was not opened\n", seqFile);
exit(1);
}

for(i=0; i < nLines; i++)
{
fgets(seq[i], nBuffer, in_fp);
// deletet the invalid characters
token = strtok(seq[i], seps);
/* if there are errors in the input sequence */
if(strlen(seq[i]) <= 0)
{
printf("the length of the sequence is less than 1 in line %d of file %s!\n", i, seqFile);
exit(1);
}
}
fclose(in_fp);

free(line);

(*pSeq) = seq;
}

/************************************************************************
NAME
AllocateDataSpaceChar - allocate memory with specified space

DESCRIPTION
This function ...

Input:
row and col of the space, the pointer
Output:
correctAction or not
Global variables list:
None.
*************************************************************************/
int AllocateDataSpaceChar(char ***pData, int row, int col)
{
int i;
char** Data;
Data = (char**)malloc(sizeof(char*) * row);
SUCCESS( Data );
for(i=0; i < row; i++)
{
Data[ i ] = (char *)malloc(sizeof(char) * col);
SUCCESS( Data[ i ] );
}

(*pData) = Data;
return correctAction;
}

/************************************************************************
NAME

DESCRIPTION
This function ...

Input:
specifed data
Output:
observation list
Global variables list:
M, iTrain, trainData.
*************************************************************************/
{
int nBuffer = strlen(trainData[0]); // initial size of the buffer
char* buffer;
char* buffer2;
int nObserv = 0;
int length;
int i, j, k, ii;
// allocate memory for the buffer
buffer = (char*) malloc((nBuffer+1)*sizeof(char));

// initialize that there is one observation in the observation array
buffer[0] = trainData[0][0];
nObserv = 1;
for(i=0; i < iTrain; i++)
{
length = strlen(trainData[i]);
for(j=0; j < length; j++)
{
// find the position of the current observation in the observation array
for(k=0; k < nObserv; k++)
{
if(trainData[i][j] <= buffer[k])
break;
}
// if it is a new observation
// insert it into the observation array with insertion sort
if(k == nObserv || trainData[i][j] < buffer[k])
{
nObserv++;
if(nObserv == nBuffer)
{
nBuffer = nBuffer + nBuffer;
buffer2 = (char*) malloc((nBuffer+1)*sizeof(char));
for(ii=0; ii < nObserv; ii++)
{            buffer2[ii] = buffer[ii];
}
free(buffer);
buffer = buffer2;
}

// insert the new observation into the array
if(trainData[i][j] < buffer[k])
{
for(ii=nObserv-2; ii >= k; ii--)
{
buffer[ii+1] = buffer[ii];
}
}
buffer[k] = trainData[i][j];
}
}
}

// assign the known observations to the observation array
M = nObserv;
observations = (char*)malloc(M*sizeof(char));
for(i=0; i < M; i++)
observations[i] = buffer[i];

free(buffer);
}

/************************************************************************
NAME
initEmissions - initialize the emission matrix

DESCRIPTION
If the emission matrix is read from file, it will be normalized.
If the emission matrix is initlized at the first time, the emission
matrix will be initialized as follows.
START state will emit '\$' with probability 1 and other observations
with probabilities with 0. END state will emit '#' with probabilities
with probability 1 and other observations with probabilities with 0.
Other states will emit '\$' and '#' with probabilities 0, and other
observations with uniformly distribution.

Input:
None
Output:
emission matrix
Global variables list:
N, M, emissions, extraObservations.
*************************************************************************/
void initEmissions()
{
double temp;
int i, j;
int iObservationStart, iObservationEnd;
if(observationsDefined == FALSE)
{
if(AllocateDataSpace( &emissions, N, M ) != correctAction)
{
printf("allocate memory for emissions error!\n");
exit(1);
}
/* START state and END state */
// emit start observation with probability 1 in START state
// emit ens   observation with probability 1 in END   state
for(j=0; j < M; j++)
{
emissions[0][j]  = 0;
emissions[N-1][j] = 0;
}
iObservationStart = getObservation('\$');
emissions[0][iObservationStart] = 1;
iObservationEnd = getObservation('#');
emissions[N-1][iObservationEnd] = 1;

/* other states */
temp = 1.0 / (double)(M - extraObservations);
for(i=1; i < N-1; i++)
{
/* emission matrix */
for(j=0; j < M; j++)
{
emissions[i][j] = temp;
}
// other states will not emit the start and end observations
emissions[i][iObservationStart] = 0;
emissions[i][iObservationEnd] = 0;
}
//  observationsDefined = TRUE;
}
else
{
for(i=0; i < N; i++)
{
double sum = 0;
/* emission probabilities in one state */
for(j=0; j < M; j++)
{
sum = sum + emissions[i][j];
}
// if all of the emission probabilities in one state is very small
if(sum < 0.1)
{
printf("error in the emission matrix!\n");
exit(1);
}
/* normalize the emission probabilities */
for(j=0; j < M; j++)
{
emissions[i][j] = emissions[i][j] / sum;
}
}
}
}

/************************************************************************
NAME
initTransitions - initialize the transition matrix

DESCRIPTION
The function normalizes the transition matrix.

Input:
None
Output:
transition matrix
Global variables list:
N, transitions.
*************************************************************************/
void initTransitions()
{
int i, j;
for(i=0; i < N; i++)
{
double sum = 0; // used for normalizing the transition matrix
/* transition probabilities from one state */
for(j=0; j < N; j++)
{
sum = sum + transitions[i][j];
}
// if all of the transition probabilities left one state is very small
if(sum < 0.1)
{
printf("error in the transition matrix!\n");
exit(1);
}
/* normalization */
for(j=0; j < N; j++)
{
transitions[i][j] = transitions[i][j] / sum;
}
}
}

/************************************************************************
NAME
initPi - initialize the initial state distribution

DESCRIPTION
It is assumed that the model is in the first state with probability 1

Input:
None
Output:
pi
Global variables list:
N, pi.
*************************************************************************/
void initPi()
{
int i;
pi[0] = 1;
for(i=1; i < N; i++)
{
pi[i] = 0;
}
}

/************************************************************************
NAME
init - initialize the hidden Markov model and the space

DESCRIPTION

Input:
None
Output:
None
Global variables list:
N, T, alpha, beta.
*************************************************************************/
void init()
{
initTransitions();
initEmissions();
initPi();
AllocateDataSpace(&alpha, T, N);
AllocateDataSpace(&beta, T, N);
}

/************************************************************************
NAME
check_mode - check whether the files are provided for the specified
mode.

DESCRIPTION
This function ... Different files must be provided for different
mode.

TRAINING: training file must be provided
TESTING:  testing file must be provided
TRAINING_TESTING: both training file and testing file must be provided

Input:
trainFile and testFile pointers
Output:
None
Global variables list:
None.
*************************************************************************/
void check_mode(int mode, char* trainFile, char* testFile)
{
switch(mode)
{
case TRAINING:
if(trainFile == NULL)
{
printf("In TRAINING mode, the training file was not provided!\n");
exit(1);
}
break;
case TESTING:
if(testFile == NULL)
{
printf("In TESTING mode, the testing file was not provided!\n");
exit(1);
}
break;
case TRAINING_TESTING:
if(trainFile == NULL || testFile == NULL)
{
printf("Training file and/or testing file was not provided\n");
exit(1);
}
break;
default:
printf("%s", usage);
exit(1);
break;
}
}
```