% www.pudn.com > Gaussian.zip > Gaussian.tex, change:2004-06-07,size:51727b

\documentclass[a4paper]{article}

\usepackage{array,amsmath,amssymb,rotating,graphicx}

\input{tutorial_style.texinc}

\newcommand{\mat}[1]{{\tt >> #1} \\}
\newcommand{\com}[1]{{\tt #1}}

%\newcommand{\tit}[1]{{\noindent \bf #1 \\}}
\newcommand{\tab}{\hspace{1em}}

% for MATH MODE
\newcommand{\trn}{^{\mathsf T}} % transposition
\newcommand{\xv}{\ensuremath\mathbf{x}}  % vector x
\newcommand{\muv}{\ensuremath\boldsymbol{\mu}}  % vector mu
\newcommand{\Sm}{\ensuremath\boldsymbol{\Sigma}}  % matrix Sigma
\newcommand{\Tm}{\ensuremath\boldsymbol{\Theta}}  % matrix Theta
\newcommand{\Rf}{\ensuremath\mathbb{R}}

\setlength{\hoffset}{-1in}
\setlength{\voffset}{-1in}

\setlength{\topskip}{0cm}

\setlength{\textwidth}{16cm}
\setlength{\evensidemargin}{2.5cm}
\setlength{\oddsidemargin}{2.5cm}

\setlength{\textheight}{24cm}
\setlength{\topmargin}{2.5cm}

%\pagestyle{fancyplain}

\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%% Make Title %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\author{\small Barbara Resch (minor changes by Erhard Rank)\\
Signal Processing and Speech Communication Laboratory\\
Inffeldgasse 16c/II\\
phone 873--4436}
\date{}

\MakeTutorialTitle{Gaussian Statistics and Unsupervised Learning}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsubsection*{Abstract}
This tutorial presents the properties of the Gaussian probability
density function. Subsequently, supervised and unsupervised pattern
recognition methods are treated. Supervised classification algorithms
are based on a labeled data set. The knowledge about the class
membership of this training data set is used for the classification of
new samples. Unsupervised learning methods establish clusters from an
unlabeled training data set.  Clustering algorithms such as the
$K$-means, the EM (expectation-maximization) algorithm, and the
Viterbi-EM algorithm are presented.

\subsubsection*{Usage} To make full use of this tutorial you have to
\begin{enumerate}

\item Download the file
\HREF{http://www.igi.tugraz.at/lehre/CI/tutorials/Gaussian.zip}
{\texttt{Gaussian.zip}} which contains this tutorial \html{in
printable format (\href{Gaussian.pdf}{PDF} and
\href{Gaussian.ps.gz}{ps.gz})} and the accompanying Matlab
programs.

\item Unzip \texttt{Gaussian.zip} which will generate a
subdirectory named \texttt{Gaussian/matlab} where you can find
all the Matlab programs.

\item Add the path \texttt{Gaussian/matlab} to the matlab search path,
for example with a command like
\verb#addpath('C:\Work\Gaussian\matlab')# if you are using a Windows
machine, or by using a command like
\verb#addpath('/home/jack/Gaussian/matlab')# if you are on a
Unix/Linux machine.

\end{enumerate}

\subsubsection*{Sources}
This tutorial is based on

% ftp://ftp.idiap.ch/pub/sacha/labs/labman1.pdf
% or http://www.ai.mit.edu/~murphyk/Software/HMM/labman2.pdf
\begin{itemize}
\item EPFL lab notes
``Introduction to Gaussian Statistics and Statistical Pattern
Recognition'' by Herv\'e Bourlard, Sacha Krstulovi\'c, and Mathew
Magimai-Doss.
\end{itemize}

\html{
\subsubsection*{Contents}
}

\section{Gaussian statistics}
%%%%%%%%%
%%%%%%%%%

%%%%%%%%%
\subsection{Samples from a Gaussian density}
\label{samples}
%%%%%%%%%

\subsubsection*{Useful formulas and definitions:}\label{sec:gausspdf}
\begin{itemize}

\item The {\em Gaussian probability density function (pdf)} for the
$d$-dimensional random variable $\xv \circlearrowleft {\cal N}(\muv,\Sm)$ (i.e., variable $\xv \in \Rf^d$ following the
Gaussian, or Normal, probability law) is given by:
\begin{equation}\label{eq:gauss}
g_{(\muv,\Sm)}(\xv) = \frac{1}{\sqrt{2\pi}^d \sqrt{\det\left(\Sm\right)}} \, e^{-\frac{1}{2} (\xv-\muv)\trn \Sm^{-1} (\xv-\muv)}
\end{equation}
where $\muv$ is the mean vector and $\Sm$ is the covariance matrix.
$\muv$ and $\Sm$ are the {\em parameters} of the Gaussian
distribution.

\item The mean vector $\muv$ contains the mean values of each
dimension, $\mu_i = E(x_i)$, with $E(x)$ being the \emph{expected
value} of $x$.

\item All of the variances $c_{ii}$ and covariances $c_{ij}$ are
collected together into the covariance matrix $\Sm$ of dimension
$d\times d$:

\begin{equation*}
\Sm =
\left[
\begin{array}{*{4}{c}}
c_{11} & c_{12} & \cdots & c_{1n} \\
c_{21} & c_{22} & \cdots & c_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
c_{n1} & c_{n2} & \cdots & c_{nn} \\
\end{array}
\right]
\end{equation*}

The covariance $c_{ij}$ of two components $x_i$ and $x_j$ of $\xv$
measures their tendency to vary together, i.e., to co-vary,
$c_{ij} = E\left((x_i-\mu_i)\trn\,(x_j-\mu_j)\right).$
If two components $x_i$ and $x_j$, $i\ne j$, have zero covariance
$c_{ij} = 0$ they are {\em orthogonal} in the statistical sense, which
transposes to a geometric sense (the expectation is a scalar product
of random variables; a null scalar product means orthogonality).  If
all components of $\xv$ are mutually orthogonal the covariance matrix
has a diagonal form.

\item $\sqrt{\Sm}$ defines the {\em standard deviation} of the random
variable $\xv$. Beware: this square root is meant in the {\em matrix
sense}.

\item If $\xv \circlearrowleft {\cal N}(\mathbf{0},\mathbf{I})$ ($\xv$
follows a normal law with zero mean and unit variance; $\mathbf{I}$
denotes the identity matrix), and if $\mathbf{y} = \muv + \sqrt{\Sm}\,\xv$, then $\mathbf{y} \circlearrowleft {\cal N}(\muv,\Sm)$.

\end{itemize}

\subsubsection{Experiment:}
Generate samples $X$ of $N$ points, $X=\{\xv_1, \xv_2,\ldots,\xv_N\}$, with $N=10000$, coming from a 2-dimensional
Gaussian process that has mean
$\muv = \left[ \begin{array}{c} 730 \\ 1090 \end{array} \right]$
and variance
\begin{itemize}
%%%%%
\item
8000 for both dimensions ({\em spherical process}) (sample $X_1$):
$\Sm_1 = \left[ \begin{array}{cc} 8000 & 0 \\ 0 & 8000 \end{array} \right]$
%%%%%
\item
expressed as a {\em diagonal} covariance matrix (sample $X_2$):
$\Sm_2 = \left[ \begin{array}{cc} 8000 & 0 \\ 0 & 18500 \end{array} \right]$
%%%%%
\item
expressed as a {\em full} covariance matrix (sample $X_3$):
$\Sm_3 = \left[ \begin{array}{cc} 8000 & 8400 \\ 8400 & 18500 \end{array} \right]$
%%%%%
\end{itemize}
%
Use the function \com{gausview} (\com{>> help gausview}) to plot the
results as clouds of points in the 2-dimensional plane, and to view the
corresponding 2-dimensional probability density functions (pdfs) in 2D and
3D.

\subsubsection*{Example:}
\mat{N = 10000;}
\mat{mu = [730 1090]; sigma\_1 = [8000 0; 0 8000];}
\mat{X1 = randn(N,2) * sqrtm(sigma\_1) + repmat(mu,N,1);}
\mat{gausview(X1,mu,sigma\_1,'Sample X1');}
%
Repeat for the two other variance matrices $\Sm_2$ and $\Sm_3$.
Use the radio buttons to switch the plots on/off. Use the ``view''
buttons to switch between 2D and 3D. Use the mouse to rotate the plot
(must be enabled in \com{Tools} menu: \com{Rotate 3D}, or by the
$\circlearrowleft$ button).

\subsubsection*{Questions:}
By simple inspection of 2D views of the data and of the corresponding
pdf contours, how can you tell which sample corresponds to a spherical
process (as the sample $X_1$), which sample corresponds to a process
with a diagonal covariance matrix (as $X_2$), and which to a process
with a full covariance matrix (as $X_3$)?

\subsubsection*{Find the right statements:}
\begin{itemize}
\item[$\Box$] In process 1 the first and the second component of the
vectors $\xv_i$ are independent.

\item[$\Box$] In process 2 the first and the second component of the
vectors $\xv_i$ are independent.

\item[$\Box$] In process 3 the first and the second component of the
vectors $\xv_i$ are independent.

\item[$\Box$] If the first and second component of the vectors $\xv_i$
are independent, the cloud of points and the pdf contour has the
shape of a circle.

\item[$\Box$] If the first and second component of the vectors $\xv_i$
are independent, the cloud of points and pdf contour has to be
elliptic with the principle axes of the ellipse aligned with the
abscissa and ordinate axes.

\item[$\Box$] For the covariance matrix $\Sm$ the elements have to
satisfy $c_{ij} = c_{ji}$.

\item[$\Box$] The covariance matrix has to be positive definite
($\xv\trn \Sm\, \xv \ge 0$). (If yes, what happens if not? Try it
out in \textsc{Matlab}).
\end{itemize}

%\pagebreak
%%%%%%%%%
\subsection{Gaussian modeling: Mean and variance of a sample}
%%%%%%%%%

We will now estimate the parameters $\muv$ and $\Sm$ of the Gaussian
models from the data samples.

\subsubsection*{Useful formulas and definitions:}
\begin{itemize}
\item Mean estimator: $\displaystyle \hat{\muv} = \frac{1}{N} \sum_{i=1}^{N} \xv_i$
\item Unbiased covariance estimator: $\displaystyle \hat{\Sm} = \frac{1}{N-1} \; \sum_{i=1}^{N} (\xv_i-\muv)\trn (\xv_i-\muv)$
\end{itemize}

\subsubsection{Experiment:}
Take the sample $X_3$ of 10000 points generated from ${\cal N}(\muv,\Sm_3)$. Compute an estimate $\hat{\muv}$ of its mean and an
estimate $\hat{\Sm}$ of its variance:
\begin{enumerate}
\item with all the available points \hspace{1cm}$\hat{\muv}_{(10000)} =$\hspace{3.5cm}$\hat{\Sm}_{(10000)} =$ \vspace{0.8cm}
\item with only 1000 points \hspace{1.95cm}$\hat{\muv}_{(1000)} =$\hspace{3.7cm}$\hat{\Sm}_{(1000)} =$ \vspace{0.8cm}
\item with only 100 points \hspace{2.2cm}$\hat{\muv}_{(100)} =$\hspace{3.9cm}$\hat{\Sm}_{(100)} =$ \vspace{0.8cm}
\end{enumerate}
Compare the estimated mean vector $\hat{\muv}$ to the original mean
vector $\muv$ by measuring the Euclidean distance that separates them.
Compare the estimated covariance matrix $\hat{\Sm}$ to the original
covariance matrix $\Sm_3$ by measuring the matrix 2-norm of their
difference (the norm $\|\mathbf{A}-\mathbf{B}\|_2$ constitutes a
measure of similarity of two matrices $\mathbf{A}$ and $\mathbf{B}$;
use {\sc Matlab}'s \com{norm} command).

\subsubsection{Example:}
In the case of 1000 points (case 2.): \\
\mat{X = X3(1:1000,:);}
\mat{N = size(X,1)}
\mat{mu\_1000 = sum(X)/N}
\textit{--or--}\\
\mat{mu\_1000 = mean(X)}
\mat{sigma\_1000 = (X - repmat(mu\_1000,N,1))' * (X - repmat(mu\_1000,N,1)) / (N-1)}
\textit{--or--}\\
\mat{sigma\_1000 = cov(X)}

\noindent
\mat{\% Comparison of means and covariances:}
\mat{e\_mu =  sqrt((mu\_1000 - mu) * (mu\_1000 - mu)')}
\mat{\% (This is the Euclidean distance between mu\_1000 and mu)}
\mat{e\_sigma = norm(sigma\_1000 - sigma\_3)}
\com{>> \% (This is the 2-norm of the difference between sigma\_1000 and sigma\_3)}

\subsubsection{Question:}
When comparing the estimated values $\hat{\muv}$ and $\hat{\Sm}$ to
the original values of $\muv$ and $\Sm_3$ (using the Euclidean
distance and the matrix 2-norm), what can you observe?

\subsubsection*{Find the right statements:}
\begin{itemize}
\item[$\Box$] An accurate mean estimate requires more points than an
accurate variance estimate.
\item[$\Box$] It is very important to have enough training examples to
estimate the parameters of the data generation process accurately.
\end{itemize}

%\pagebreak

\subsection{Likelihood of a sample with respect to a Gaussian model}
\label{sec:likelihood}

In the following we compute the likelihood of a sample point $\xv$,
and the joint likelihood of a series of samples $X$ for a given model
$\Tm$ with one Gaussian.  The likelihood will be used in the formula
for classification later on (sec.~\ref{sec:classification}).

\subsubsection*{Useful formulas and definitions:}
\begin{itemize}
\item {\em Likelihood}: the likelihood of a sample point $\xv_i$ given
a data generation model (i.e., given a set of parameters $\Tm$ for
the model pdf) is the value of the pdf $p(\xv_i|\Tm)$ for that
point. In the case of Gaussian models $\Tm = (\muv,\Sm)$, this
amounts to the evaluation of equation~\ref{eq:gauss}.
\item {\em Joint likelihood}: for a set of independent identically
distributed (i.i.d.) samples, say $X = \{\xv_1, \xv_2, \ldots, \xv_N \}$, the joint (or total) likelihood is the product of the
likelihoods for each point. For instance, in the Gaussian case:
\begin{equation}\label{eq:joint-likelihood}
p(X|\Tm) = \prod_{i=1}^{N} p(\xv_i|\Tm) = \prod_{i=1}^{N} p(\xv_i|\muv,\Sm) = \prod_{i=1}^{N} g_{(\muv,\Sm)}(\xv_i)
\end{equation}
\end{itemize}

\subsubsection{Question:}
Why might we want to compute the {\em log-likelihood} rather than
the simple {\em likelihood}?
\\

Computing the log-likelihood turns the product into a sum:
$p(X|\Tm) = \prod_{i=1}^{N} p(\xv_i|\Tm) \quad \Leftrightarrow \quad \log p(X|\Tm) = \log \prod_{i=1}^{N} p(\xv_i|\Tm) = \sum_{i=1}^{N} \log p(\xv_i|\Tm)$

In the Gaussian case, it also avoids the computation of the exponential:
\begin{eqnarray}
\label{eq:loglikely}
p(\xv|\Tm) & = & \frac{1}{\sqrt{2\pi}^d \sqrt{\det\left(\Sm\right)}}
\, e^{-\frac{1}{2} (\xv-\muv)\trn \Sm^{-1} (\xv-\muv)} \nonumber \\
\log p(\xv|\Tm) & = &
\frac{1}{2} \left[-d \log \left( 2\pi \right)
-  \log \left( \det\left(\Sm\right) \right)
-  (\xv-\muv)\trn \Sm^{-1} (\xv-\muv)\right]
\end{eqnarray}
Furthermore, since $\log(x)$ is a monotonically growing function, the
log-likelihoods have the same relations of order as the likelihoods
$p(x|\Tm_1) > p(x|\Tm_2) \quad \Leftrightarrow \quad \log p(x|\Tm_1) > \log p(x|\Tm_2),$
so they can be used directly for classification.

\subsubsection*{Find the right statements:}
We can further simplify the computation of the log-likelihood in
eq.~\ref{eq:loglikely} for classification by
\begin{itemize}
\item[$\Box$] dropping the division by two: $\frac{1}{2} \left[\ldots\right]$,
\item[$\Box$] dropping term $d\log \left( 2\pi \right)$,
\item[$\Box$] dropping term $\log \left( \det\left(\Sm\right) \right)$,
\item[$\Box$] dropping term $(\xv-\muv)\trn \Sm^{-1} (\xv-\muv)$,
\item[$\Box$] calculating the term $\log \left( \det\left(\Sm\right) \right)$ in advance.
\end{itemize}

\medskip
\noindent
We can drop term(s) because:
\begin{itemize}
\item[$\Box$] The term(s) are independent of $\muv$.
\item[$\Box$] The terms are negligibly small.
\item[$\Box$] The term(s) are independent of the classes.
\end{itemize}

\noindent As a summary, log-likelihoods use simpler computation and are readily
usable for classification.

\subsubsection{Experiment:}
Given the following 4 Gaussian models $\Tm_i = (\muv_i,\Sm_i)$
\begin{center}
\begin{tabular}{ccc}
${\cal N}_1: \; \Tm_1 = \left( \left[\begin{array}{c}730 \\ 1090\end{array}\right], \left[\begin{array}{cc}8000 & 0 \\ 0 & 8000\end{array}\right] \right)$ & \hspace{2cm} &
${\cal N}_2: \; \Tm_2 = \left( \left[\begin{array}{c}730 \\ 1090\end{array}\right], \left[\begin{array}{cc}8000 & 0 \\ 0 & 18500\end{array}\right] \right)$ \\[2em]
${\cal N}_3: \; \Tm_3 = \left( \left[\begin{array}{c}730 \\ 1090\end{array}\right], \left[\begin{array}{cc}8000 & 8400 \\ 8400 & 18500\end{array}\right] \right)$ & \hspace{2cm} &
${\cal N}_4: \; \Tm_4 = \left( \left[\begin{array}{c}270 \\ 1690\end{array}\right], \left[\begin{array}{cc}8000 & 8400 \\ 8400 & 18500\end{array}\right] \right)$
\end{tabular}
\end{center}
\vspace{0.5em}
compute the following {log-likelihoods} for the whole sample $X_3$ (10000 points):
\[ \log p(X_3|\Tm_1),\; \log p(X_3|\Tm_2),\; \log p(X_3|\Tm_3),\; \text{and}\; \log p(X_3|\Tm_4). \]

\subsubsection{Example:}
\mat{N = size(X3,1)}
\mat{mu\_1 = [730 1090]; sigma\_1 = [8000 0; 0 8000];}
\mat{logLike1 = 0;}
\mat{for i = 1:N;}
\com{logLike1 = logLike1 + (X3(i,:) - mu\_1) * inv(sigma\_1) * (X3(i,:) - mu\_1)';} \\
\com{end;} \\
\mat{logLike1 =  - 0.5 * (logLike1 + N*log(det(sigma\_1)) + 2*N*log(2*pi))}

\noindent Note: Use the function \com{gausview} to compare the
relative positions of the models ${\cal N}_1$, ${\cal N}_2$, ${\cal N}_3$ and ${\cal N}_4$ with respect to the data set $X_3$, e.g.:
\\
%
\mat{mu\_1 = [730 1090]; sigma\_1 = [8000 0; 0 8000];}
\mat{gausview(X3,mu\_1,sigma\_1,'Comparison of X3 and N1');}
%
\vspace{-\baselineskip}
\subsubsection*{Question:}
Of ${\cal N}_1$, ${\cal N}_2$, ${\cal N}_3$ and ${\cal N}_4$, which
model ``explains'' best the data $X_3$?  Which model has the highest
number of parameters (with non-zero values)?  Which model would you
choose for a good compromise between the number of parameters and the
capacity to accurately represent the data?

%%%%%%%%%
%%%%%%%%%
\section{Statistical pattern recognition}
%%%%%%%%%
%%%%%%%%%

%%%%%%%%%
\subsection{A-priori class probabilities}
\label{sec:apriori}
%%%%%%%%%
\subsubsection{Experiment:}
Load data from file ``vowels.mat''. This file contains a database of
2-dimensional samples of speech features in the form of formant
frequencies (the first and the second spectral formants, $[F_1,F_2]$).
The formant frequency samples represent features that would be
extracted from the speech signal for several occurrences of the vowels
/a/, /e/, /i/, /o/, and /y/\footnote{/y/ is the phonetic symbol for
\"u''}.  They are grouped in matrices of size $N\times2$, where
each of the $N$ lines contains the two formant frequencies for one
occurrence of a vowel.

Supposing that the whole database covers adequately an imaginary
language made only of /a/'s, /e/'s, /i/'s, /o/'s, and /y/'s, compute
the probability $P(q_k)$ of each class $q_k$, $k \in \{\text{/a/},\text{/e/},\text{/i/},\text{/o/},\text{/y/}\}$. Which is
the most common and which the least common phoneme in our imaginary
language?

\subsubsection{Example:}
\mat{Na = size(a,1); Ne = size(e,1); Ni = size(i,1); No = size(o,1); Ny = size(y,1);}
\mat{N = Na + Ne + Ni + No + Ny;}
\mat{Pa = Na/N}
\mat{Pi = Ni/N}
etc.

%%%%%%%%
\subsection{Gaussian modeling of classes}
\label{gaussmod}
%%%%%%%%%

\subsubsection*{Experiment:}
Plot each vowel's data as clouds of points in the 2D plane. Train the
Gaussian models corresponding to each class (use directly the
\com{mean} and \com{cov} commands). Plot their contours (use directly
the function \com{plotgaus(mu,sigma,color)} where \com{color =
[R,G,B]}).

\subsubsection{Example:}
\mat{plotvow; \% Plot the clouds of simulated vowel features}
(Do not close the figure obtained, it will be used later on.) \\
Then compute and plot the Gaussian models: \\
\mat{mu\_a = mean(a);}
\mat{sigma\_a = cov(a);}
\mat{plotgaus(mu\_a,sigma\_a,[0 1 1]);}
\mat{mu\_e = mean(e);}
\mat{sigma\_e = cov(e);}
\mat{plotgaus(mu\_e,sigma\_e,[0 1 1]);}
etc.

%%%%%%%%%%%%%%%%%%
%%%%%%%%%
\subsection{Bayesian classification}
\label{sec:classification}
%%%%%%%%%

We will now find how to classify a feature vector $\xv_i$ from a data
sample (or several feature vectors $X$) as belonging to a certain
class $q_k$.

\subsubsection*{Useful formulas and definitions:}
\begin{itemize}
\item {\em Bayes' decision rule}:
$X \in q_k \quad \mbox{if} \quad P(q_k|X,\Tm) \geq P(q_j|X,\Tm), \quad\forall j \neq k$
This formula means: given a set of classes $q_k$, characterized by a
set of known parameters in model $\Tm$, a set of one or more speech
feature vectors $X$ (also called {\em observations}) belongs to the
class which has the highest probability once we actually know (or
``see'', or ``measure'') the sample $X$. $P(q_k|X,\Tm)$ is therefore
called the {\em a posteriori probability}, because it depends on
having seen the observations, as opposed to the {\em a priori}
probability $P(q_k|\Tm)$ which does not depend on any observation
(but depends of course on knowing how to characterize all the
classes $q_k$, which means knowing the parameter set $\Tm$).

\item For some classification tasks (e.g. speech recognition), it is
practical to resort to {\em Bayes' law}, which makes use of {\em
likelihoods} (see sec.~\ref{sec:likelihood}), rather than trying
to directly estimate the posterior probability $P(q_k|X,\Tm)$.
Bayes' law says:
\begin{equation}\label{eq:decision-rule}
P(q_k|X,\Tm) = \frac{p(X|q_k,\Tm)\; P(q_k|\Tm)}{p(X|\Tm)}
\end{equation}
where $q_k$ is a class, $X$ is a sample containing one or more
feature vectors and $\Tm$ is the parameter set of all the class
models.

\item The speech features are usually considered equi-probable:
$p(X|\Tm)=\text{const.}$ (uniform prior distribution for $X$).
Hence, $P(q_k|X,\Tm)$ is proportional to $p(X|q_k,\Tm) P(q_k|\Tm)$
for all classes:
$P(q_k|X,\Tm) \propto p(X|q_k,\Tm)\; P(q_k|\Tm), \quad \forall k$

\item Once again, it is more convenient to do the computation in the
$\log$ domain:
\begin{equation}\label{eq:log-decision-rule}
\log P(q_k|X,\Tm) \propto \log p(X|q_k,\Tm) + \log P(q_k|\Tm)
\end{equation}
\end{itemize}

In our case, $\Tm$ represents the set of \emph{all} the means $\muv_k$
and variances $\Sm_k$, $k \in \{\text{/a/},\text{/e/},\text{/i/},\text{/o/},\text{/y/}\}$ of our data
generation model.  $p(X|q_k,\Tm)$ and $\log p(X|q_k,\Tm)$ are the
joint likelihood and joint log-likelihood
(eq.~\ref{eq:joint-likelihood} in section~\ref{sec:likelihood}) of the
sample $X$ with respect to the model $\Tm$ for class $q_k$ (i.e., the
model with parameter set $(\muv_k,\Sm_k)$).

The probability $P(q_k|\Tm)$ is the a-priori class probability for the
class $q_k$. It defines an absolute probability of occurrence for the
class $q_k$. The a-priori class probabilities for our phoneme classes
have been computed in section~\ref{sec:apriori}.

\subsubsection{Experiment:}
Now, we have modeled each vowel class with a Gaussian pdf (by
computing means and variances), we know the probability $P(q_k)$ of
each class in the imaginary language (sec.~\ref{sec:apriori}), which
we assume to be the correct a priori probabilities $P(q_k|\Tm)$ for
each class given our model $\Tm$.  Further we assume that the speech
\emph{features} $\xv_i$ (as opposed to speech {\em classes} $q_k$) are
equi-probable\footnote{Note, that -- also for not equi-probable
features -- finding the most probable class $q_k$ according to
eq.~\ref{eq:decision-rule} does not depend on the denominator
$p(X|\Tm)$, since $p(X|\Tm)$ is independent of $q_k$!}.

What is the most probable class $q_k$ for each of the formant pairs
(features) $\xv_i=[F_1,F_2]\trn$ given in the table below?  Compute
the values of the functions $f_k(\xv_i)$ for our model $\Tm$ as the
right-hand side of eq.~\ref{eq:log-decision-rule}: $f_k(\xv_i) = \log p(\xv_i|q_k,\Tm) + \log P(q_k|\Tm)$, proportional to the log of the
posterior probability of $\xv_i$ belonging to class $q_k$.

\medskip
\noindent
\begin{center}
\renewcommand{\arraystretch}{1.5}
\setlength{\tabcolsep}{0.12in}
\begin{tabular}{|c|c|c|c|c|c|c|c|} \hline
i & \small$\xv_i=[F_1,F_2]\trn$ &
\small $f_{\text{/a/}}(\xv_i)$ &
\small $f_{\text{/e/}}(\xv_i)$ &
\small $f_{\text{/i/}}(\xv_i)$ &
\small $f_{\text{/o/}}(\xv_i)$ &
\small $f_{\text{/y/}}(\xv_i)$ &
Most prob.\ class $q_k$ \\
\hline
1 & $[400,1800]\trn$ & & & & & & \\ \hline
2 & $[400,1000]\trn$ & & & & & & \\ \hline
3 & $[530,1000]\trn$ & & & & & & \\ \hline
4 & $[600,1300]\trn$ & & & & & & \\ \hline
5 & $[670,1300]\trn$ & & & & & & \\ \hline
6 & $[420,2500]\trn$ & & & & & & \\ \hline
\end{tabular}
\end{center}

\subsubsection{Example:}
Use function \com{gloglike(point,mu,sigma)} to compute the
log-likelihoods $\log p(\xv_i|q_k,\Tm)$.  Don't forget to add the log
of the prior probability $P(q_k|\Tm)$!
E.g., for the feature set $x_1$ and class /a/ use\\
\com{>> gloglike([400,1800],mu\_a,sigma\_a) + log(Pa)}

\bigskip
%%%%%%%%%
\subsection{Discriminant surfaces}
\label{sec:discr}
%%%%%%%%%
For the Bayesian classification in the last section we made use of the
\emph{discriminant functions} $f_k(\xv_i) = \log p(\xv_i|q_k,\Tm) + \log P(q_k|\Tm)$ to classify data points $\xv_i$.  This corresponds to
establishing \emph{discriminant surfaces} of dimension $d-1$ in the
vector space for $\xv$ (dimension $d$) to separate regions for the
different classes.

\subsubsection*{Useful formulas and definitions:}
\begin{itemize}
\item {\em Discriminant function}: a set of functions $f_k(\xv)$ allows
to classify a sample $\xv$ into $k$ classes $q_k$ if:
$\xv \in q_k \quad \Leftrightarrow \quad f_k(\xv,\Tm_k) \geq f_l(\xv,\Tm_l), \quad \forall l \neq k$
In this case, the $k$ functions $f_k(\xv)$ are called discriminant
functions.
\end{itemize}

The a-posteriori probability $P(q_k|\xv_i)$ that a sample $\xv_i$
belongs to class $q_k$ is itself a discriminant function:
\begin{eqnarray*}
\xv \in q_k
& \Leftrightarrow & P(q_k|\xv_i) \geq P(q_l|\xv_i),\quad \forall l \neq k \\
& \Leftrightarrow & p(\xv_i|q_k)\; P(q_k) \geq p(\xv_i|q_l)\; P(q_l),\quad
\forall l \neq k \\
& \Leftrightarrow & \log p(\xv_i|q_k)+\log P(q_k) \geq \log
p(\xv_i|q_l)+\log P(q_l),\quad \forall l \neq k
\end{eqnarray*}

As in our case the samples $\xv$ are two-dimensional vectors, the
discriminant surfaces are one-dimensional, i.e., lines at equal values
of the discriminant functions for two distinct classes.

\subsubsection{Experiment:}
%%%%%%%%%%%%
\begin{figure}
\centerline{\includegraphics[height=0.95\textheight]{iso}}
\caption{\label{iso}Iso-likelihood lines for the Gaussian pdfs
${\cal N}(\muv_{\text{/i/}},\Sm_{\text{/i/}})$ and ${\cal N}(\muv_{\text{/e/}},\Sm_{\text{/e/}})$ (top), and ${\cal N}(\muv_{\text{/i/}},\Sm_{\text{/e/}})$ and ${\cal N}(\muv_{\text{/e/}},\Sm_{\text{/e/}})$ (bottom).}
\end{figure}
%%%%%%%%%%%%
The iso-likelihood lines for the Gaussian pdfs ${\cal N}(\muv_{\text{/i/}},\Sm_{\text{/i/}})$ and ${\cal N}(\muv_{\text{/e/}},\Sm_{\text{/e/}})$, which we used before to
model the class /i/ and the class /e/, are plotted in
figure~\ref{iso}, first graph. On the second graph in
figure~\ref{iso}, the iso-likelihood lines for ${\cal N}(\muv_{\text{/i/}},\Sm_{\text{/e/}})$ and ${\cal N}(\muv_{\text{/e/}},\Sm_{\text{/e/}})$ (two pdfs with the
\emph{same} covariance matrix $\Sm_{\text{/e/}}$) are represented.

On these figures, use a colored pen to join the intersections of the
level lines that correspond to equal likelihoods.  Assume that the
highest iso-likelihood lines (smallest ellipses) are of the same
height. (You can also use \com{isosurf} in {\sc Matlab} to create a
color plot.)

\subsubsection{Question:}
What is the nature of the surface that separates class /i/ from class
/e/ when the two models have {\em different} variances? Can you
explain the origin of this form?

What is the nature of the surface that separates class /i/ from class
/e/ when the two models have the {\em same} variances? Why is it
different from the previous discriminant surface?
\\

Show that in the case of two Gaussian pdfs with \emph{equal covariance
matrices}, the separation between class 1 and class 2 does not
depend upon the covariance $\Sm$ any more.
\\

\tab As a summary, we have seen that Bayesian classifiers with
Gaussian data models separate the classes with combinations of
parabolic surfaces. If the covariance matrices of the models are
equal, the parabolic separation surfaces become simple hyper-planes.

\bigskip
%%%%%%%%%
%%%%%%%%%

\section{Unsupervised training}
\label{unsup}
%%%%%%%%%
%%%%%%%%%
In the previous section, we have computed the models for classes /a/,
/e/, /i/, /o/, and /y/ by knowing a-priori which training samples
belongs to which class (we were disposing of a {\em labeling} of the
training data). Hence, we have performed a {\em supervised training}
of the Gaussian models.  Now, suppose that we only have unlabeled
training data that we want to separate in several classes (e.g., 5
classes) without knowing a-priori which point belongs to which class.
This is called {\em unsupervised training}. Several algorithms are
available for that purpose, among them: the $K$-means, the EM
(Expectation-Maximization), and the Viterbi-EM algorithm.

%
All these algorithms are characterized by the following components:
\begin{itemize}
\item a set of models for the classes $q_k$ (not necessarily
Gaussian), defined by a parameter set $\Tm$ (means, variances,
priors,...);
\item a measure of membership, telling to which extent a data point
``belongs'' to a model;
\item a ``recipe'' to update the model parameters as a function of the
membership information.
\end{itemize}
The measure of membership usually takes the form of a distance measure
or the form of a measure of probability. It replaces the missing
labeling information to permit the application of standard parameter
estimation techniques.  It also defines implicitly a global criterion
of ``goodness of fit'' of the models to the data, e.g.:
\begin{itemize}
\item in the case of a distance measure, the models that are closer to
the data characterize it better;
\item in the case of a probability measure, the models with a larger
likelihood for the data explain it better.
\end{itemize}

\begin{latexonly}
Table~\ref{algos} summarizes the components of the algorithms that
will be studied in the following experiments. More detail will be
given in the corresponding subsections.

%\pagebreak
%\newpage
%\pagestyle{empty}
\newcommand{\PBS}[1]{\let\temp=\\#1\let\\=\temp}
\newcommand{\RR}{\PBS\raggedright\hspace{0pt}}
\newcommand{\RL}{\PBS\raggedleft\hspace{0pt}}
\newcommand{\CC}{\PBS\centering\hspace{0pt}}

\begin{sidewaystable}[p]
%\setlength{\arrayrulewidth}{1.2pt}
\setlength{\tabcolsep}{0.05in}
\renewcommand{\arraystretch}{2.2}
%\hspace{-1.5em}
\begin{tabular}{|>{\RR}m{5.2em}|>{\RR}m{7.3em}|>{\CC}m{21em}|>{\RR}m{22.5em}|>{\CC}m{8em}|}
\hline
\centering\bf Algorithm &
\centering\bf Parameters &
\bf Membership measure &
\centering\bf Update method &
\bf Global criterion \\ \hline
%
$K$-means    &
$\bullet$ mean $\muv_k$ &
Euclidean distance \linebreak
$d_k(\xv_n)= \sqrt{(\xv_n-\muv_k)\trn(\xv_n-\muv_k)}$ \linebreak
(or the square of it)
&
Find the points closest to $q_k^{(i)}$, then:

\medskip
$\bullet$ $\muv_{k}^{(i+1)}$ = mean of the points closest to $q_k^{(i)}$ &
Least squares \\
\hline
%
% Kmeans w/ \linebreak Mahalanobis distance &
% $\bullet$ mean $\muv_k$ \linebreak
% $\bullet$ variance $\Sm_k$ &
% Mahalanobis distance, \linebreak (Weighted Euclidean distance)
% $d_k(x_n)= \sqrt{(x_n-\muv_k)\trn\Sm_k^{-1}(x_n-\muv_k)}$
% &
% Find the points closest to $q_k^{(i)}$, then:

% \medskip
% $\bullet$ $\muv_{k}^{(i+1)}$ = mean of the points closest to $q_k^{(i)}$

% \medskip
% $\bullet$ $\Sm_{k}^{(i+1)}$ = variance of the points closest to $q_k^{(i)}$ &
% Minimal \linebreak spread of the data \linebreak (in a Mahalanobis distance sense) \\ \hline
%
Viterbi-EM &
$\bullet$ mean $\muv_k$ \linebreak
$\bullet$ variance $\Sm_k$ \linebreak
$\bullet$ priors $P(q_k|\Tm)$
&
Posterior probability
$d_k(\xv_n) = P(q_k|\xv_n,\Tm)$
\footnotesize
$\propto \frac{1}{\sqrt{2\pi}^d \sqrt{\det\left(\Sm_k\right)}} \, e^{-\frac{1}{2} (\xv_n-\muv_k)\trn \Sm_k^{-1} (\xv_n-\muv_k)} \cdot P(q_k|\Tm)$
\normalsize
&
Do Bayesian classification of each data point, then:

\medskip
$\bullet$ $\muv_{k}^{(i+1)}$ = mean of the points belonging to $q_k^{(i)}$

\medskip
$\bullet$ $\Sm_{k}^{(i+1)}$ = variance of the points belonging to $q_k^{(i)}$

\medskip
$\bullet$ $P(q_k^{(i+1)}|\Tm^{(i+1)})$ = number of training points
belonging to $q_k^{(i)}$ / total number of training points
&
Maximum likelihood \\ \hline
%
EM &
$\bullet$ mean $\muv_k$ \linebreak
$\bullet$ variance $\Sm_k$ \linebreak
$\bullet$ priors $P(q_k|\Tm)$ &
Posterior probability
$d_k(\xv_n) = P(q_k|\xv_n,\Tm)$
\footnotesize
$\propto \frac{1}{\sqrt{2\pi}^d \sqrt{\det\left(\Sm_k\right)}} \, e^{-\frac{1}{2} (\xv_n-\muv_k)\trn \Sm_k^{-1} (\xv_n-\muv_k)} \cdot P(q_k|\Tm)$
\normalsize
&
Compute $P(q_k^{(i)}|\xv_n,\Tm^{(i)})$ (soft classification), then:

\medskip
$\bullet$ $\muv_{k}^{(i+1)} = \frac{\sum_{n=1}^{N} x_n P(q_k^{(i)}|\xv_n,\Tm^{(i)})} {\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})}$

\medskip
$\bullet$ $\Sm_{k}^{(i+1)} = \frac{\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)}) (\xv_n - \muv_k^{(i+1)})(\xv_n - \muv_k^{(i+1)})\trn } {\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})}$

\medskip
$\bullet$ $P(q_k^{(i+1)}|\Tm^{(i+1)}) = \frac{1}{N} \sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})$ &
Maximum likelihood \\ \hline
\end{tabular}

\caption{\label{algos}Characteristics of some usual unsupervised
clustering algorithms.}
\end{sidewaystable}
%
\end{latexonly}

%%%%%%%%%
%\newpage
%\pagestyle{fancyplain}
\subsection{$K$-means algorithm}
%%%%%%%%%
\subsubsection*{Synopsis of the algorithm:}
\begin{itemize}
\item Start with $K$ initial prototypes $\muv_k$, $k=1,\ldots,K$.
\item {\bf Do}:
\begin{enumerate}
\item For each data-point $\xv_n$, $n=1,\ldots,N$, compute the squared
Euclidean distance from the $k^{\text{th}}$ prototype:
\begin{eqnarray}
\label{eq:dist}
d_k(\xv_n) & = & \, \left\| \xv_n - \muv_k \right\|^2 \nonumber \\
& = & (\xv_n-\muv_k)\trn(\xv_n-\muv_k)   \nonumber
\end{eqnarray}
\item Assign each data-point $\xv_n$ to its \emph{closest prototype}
$\muv_k$, i.e., assign $\xv_n$ to the class $q_k$ if
$d_k(\xv_n) \, < \, d_l(\xv_n), \qquad \forall l \neq k$.

{\em Note}: using the squared Euclidean distance for the
classification gives the same result as using the true Euclidean
distance, since the square root is a monotonically growing
function. But the computational load is obviously smaller when the
square root is dropped.
\item Replace each prototype with the mean of the data-points assigned to
the corresponding class;
\item Go to 1.
\end{enumerate}

\item {\bf Until}: no further change occurs.
\end{itemize}

The global criterion for the present case is
$J = \sum_{k=1}^{K} \sum_{\xv_n \in q_k} d_k(\xv_n)$
and represents the total squared distance between the data and the models
they belong to. This criterion is locally minimized by the algorithm.

% Alternately, the Euclidean distance used in step 1. can be replaced by the
% Mahalanobis distance, belonging to the class of weighted Euclidean
% distances:
% $% d_k(x_n)= \sqrt{(x_n-\muv_k)\trn\Sm_k^{-1}(x_n-\muv_k)} %$
% where $\Sm_k$ is the covariance of the points associated with the class
% $q_k$.

% After the convergence of the algorithm, the final clusters of points may be
% used to make Gaussian models since we dispose of means and variances.

\subsubsection*{Experiment:}
Use the $K$-means explorer utility:
\begin{verbatim}
KMEANS K-means algorithm exploration tool

Launch it with KMEANS(DATA,NCLUST) where DATA is the matrix
of observations (one observation per row) and NCLUST is the
desired number of clusters.

The clusters are initialized with a heuristic that spreads
them randomly around mean(DATA) with standard deviation
sqrtm(cov(DATA)).

If you want to set your own initial clusters, use
KMEANS(DATA,MEANS) where MEANS is a cell array containing
NCLUST initial mean vectors.

Example: for two clusters
means{1} = [1 2]; means{2} = [3 4];
kmeans(data,means);

\end{verbatim}

%\pagebreak

Launch \com{kmeans} with the data sample \com{allvow}, which was part
of file \com{vowels.mat} and gathers all the simulated vowels data. Do
several runs with different cases of initialization of the algorithm:
\begin{enumerate}
\item 5 initial clusters determined according to the default heuristic;
\item initial \com{MEANS} values equal to some data points;
\item initial \com{MEANS} values equal to $\{\muv_{\text{/a/}}, \muv_{\text{/e/}}, \muv_{\text{/i/}}, \muv_{\text{/o/}}, \muv_{\text{/y/}}\}$.
\end{enumerate}
Iterate the algorithm until its convergence. Observe the evolution of the
cluster centers, of the data-points attribution chart and of the total
squared Euclidean distance. (It is possible to zoom these plots: left
click inside the axes to zoom $2\times$ centered on the point under the
mouse; right click to zoom out; click and drag to zoom into an area; double
click to reset the figure to the original). %Observe the mean and variance
% values found after the convergence of the algorithm.
Observe the mean values found after the convergence of the algorithm.

\subsubsection*{Example:}
\mat{kmeans(allvow,5);}
- or - \\
\mat{means =  \{ mu\_a, mu\_e, mu\_i, mu\_o, mu\_y \};}
\mat{kmeans(allvow,means);}
Enlarge the window, then push the buttons, zoom etc.
After the convergence, use:\\
\mat{for k=1:5, disp(kmeans\_result\_means\{k\}); end}
to see the resulting means.

\begin{enumerate}
\item Does the final solution depend on the initialization of the
algorithm?
\item Describe the evolution of the total squared Euclidean distance.
\item What is the nature of the discriminant surfaces corresponding to
a minimum Euclidean distance classification scheme?
\item Is the algorithm suitable for fitting Gaussian clusters?
\end{enumerate}

%%%%%%%%%
\subsection{Viterbi-EM algorithm for Gaussian clustering}
%%%%%%%%%
\subsubsection*{Synopsis of the algorithm:}
\begin{itemize}
\item Start from $K$ initial Gaussian models ${\cal N}(\muv_{k},\Sm_{k}), \; k=1\ldots K$, characterized by the set of parameters $\Tm$ (i.e., the
set of all means and variances $\muv_k$ and $\Sm_k$, $k=1\ldots K$). Set
the initial prior probabilities $P(q_k)$ to $1/K$.
\item {\bf Do}:
\begin{enumerate}
\item Classify each data-point using Bayes' rule.

This step is equivalent to having a set $Q$ of boolean hidden variables
that give a labeling of the data by taking the value 1 (belongs) or 0 (does
not belong) for each class $q_k$ and each point $\xv_n$. The value of $Q$
that maximizes $p(X,Q|\Tm)$ precisely tells which is the most probable
model for each point of the whole set $X$ of training data.

Hence, each data point $\xv_n$ is assigned to its most probable
cluster $q_k$.
\item Update the parameters ($i$ is the iteration index):
\begin{itemize}
\item update the means:
$\muv_{k}^{(i+1)} = \mbox{mean of the points belonging to } q_k^{(i)}$
\item update the variances:
$\Sm_{k}^{(i+1)} = \mbox{variance of the points belonging to } q_k^{(i)}$
\item update the priors:
$P(q_k^{(i+1)}|\Tm^{(i+1)}) = \frac{\mbox{number of training points belonging to } q_k^{(i)} }{\mbox{total number of training points}}$
\end{itemize}
\item Go to 1.
\end{enumerate}
\item {\bf Until}: no further change occurs.
\end{itemize}
The global criterion in the present case is the joint log-likelihood of
the data and their (hard) class assignment $Q$:
\begin{eqnarray*}
{\cal L}(\Tm) &=& \log p(X,Q|\Tm) \;=\; \log \prod_{k=1}^{K} \prod_{\xv_n \in q_k} p(\xv_n|\Tm_k) \\
&=& \sum_{k=1}^{K} \sum_{\xv_n \in q_k} \log p(\xv_n|\Tm_k),
\end{eqnarray*}
and represents the joint likelihood of the data with respect to the models
they belong to. This criterion is locally maximized by the algorithm.

\subsubsection*{Experiment:}
Use the Viterbi-EM explorer utility:
\begin{verbatim}
VITERB Viterbi version of the EM algorithm

Launch it with VITERB(DATA,NCLUST) where DATA is the matrix
of observations (one observation per row) and NCLUST is the
desired number of clusters.

The clusters are initialized with a heuristic that spreads
them randomly around mean(DATA) with standard deviation
sqrtm(cov(DATA)). Their initial covariance is set to cov(DATA).

If you want to set your own initial clusters, use
VITERB(DATA,MEANS,VARS) where MEANS and VARS are cell arrays
containing respectively NCLUST initial mean vectors and NCLUST
initial covariance matrices. In this case, the initial a-priori
probabilities are set equal to 1/NCLUST.

To set your own initial priors, use VITERB(DATA,MEANS,VARS,PRIORS)
where PRIORS is a vector containing NCLUST a priori probabilities.

Example: for two clusters
means{1} = [1 2]; means{2} = [3 4];
vars{1} = [2 0;0 2]; vars{2} = [1 0;0 1];
viterb(data,means,vars);

\end{verbatim}
Launch \com{viterb} with the dataset \com{allvow}. Do several runs
with different initializations of the algorithm:
\begin{enumerate}
\item 5 initial clusters determined according to the default heuristic;
\item initial \com{MEANS} values equal to some data points, and some
random \com{VARS} values (try for instance \com{cov(allvow)} for all
the classes);
\item the initial \com{MEANS}, \com{VARS} and \com{PRIORS} values
found by the $K$-means algorithm.
\item initial \com{MEANS} values equal to $\{\muv_{\text{/a/}}, \muv_{\text{/e/}}, \muv_{\text{/i/}}, \muv_{\text{/o/}}, \muv_{\text{/y/}}\}$, \com{VARS} values equal to \linebreak
$\{\Sm_{\text{/a/}}, \Sm_{\text{/e/}}, \Sm_{\text{/i/}}, \Sm_{\text{/o/}}, \Sm_{\text{/y/}}\}$, and \com{PRIORS} values
equal to
$[P_{\text{/a/}},P_{\text{/e/}},P_{\text{/i/}},P_{\text{/o/}},P_{\text{/y/}}]$;
\item initial \com{MEANS} and \com{VARS} values chosen by yourself.
\end{enumerate}
Iterate the algorithm until it converges. Observe the evolution of the
clusters, of the data points attribution chart and of the total
likelihood curve. Observe the mean, variance and priors values found
after the convergence of the algorithm. Compare them with the values
computed in section~\ref{gaussmod} (using supervised training).

\subsubsection*{Example:}
\mat{viterb(allvow,5);}
- or - \\
\mat{means =  \{ mu\_a, mu\_e, mu\_i, mu\_o, mu\_y \};}
\mat{vars = \{ sigma\_a, sigma\_e, sigma\_i, sigma\_o, sigma\_y \};}
\mat{viterb(allvow,means,vars);}
Enlarge the window, then push the buttons, zoom, etc.
After convergence, use:\\
\mat{for k=1:5, disp(viterb\_result\_means\{k\}); end}
\mat{for k=1:5, disp(viterb\_result\_vars\{k\}); end}
\mat{for k=1:5, disp(viterb\_result\_priors(k)); end}
to see the resulting means, variances and priors.

\subsubsection*{Question:}
\begin{enumerate}
\item Does the final solution depend on the initialization of the
algorithm?
\item Describe the evolution of the total likelihood. Is it monotonic?
\item In terms of optimization of the likelihood, what does the final
solution correspond to?
\item What is the nature of the discriminant surfaces corresponding to the
Gaussian classification?
\item Is the algorithm suitable for fitting Gaussian clusters?
\end{enumerate}

%%%%%%%%%
\subsection{EM algorithm for Gaussian clustering}
%%%%%%%%%
\subsubsection*{Synopsis of the algorithm:}
\begin{itemize}
\item Start from K initial Gaussian models ${\cal N}(\muv_{k},\Sm_{k}), \; k=1\ldots K$, with equal priors set to $P(q_k) = 1/K$.
\item {\bf Do}:
\begin{enumerate}
\item {\bf Estimation step}: compute the probability
$P(q_k^{(i)}|\xv_n,\Tm^{(i)})$ for each data point $\xv_n$ to belong
to the class $q_k^{(i)}$:
%
\begin{eqnarray*}
P(q_k^{(i)}|\xv_n,\Tm^{(i)}) & = & \frac{P(q_k^{(i)}|\Tm^{(i)})
\cdot p(\xv_n|q_k^{(i)},\Tm^{(i)})}
{p(\xv_n|\Tm^{(i)})} \\
& = & \frac{P(q_k^{(i)}|\Tm^{(i)}) \cdot p(\xv_n|\muv_k^{(i)},\Sm_k^{(i)}) }
{\sum_j P(q_j^{(i)}|\Tm^{(i)}) \cdot p(\xv_n|\muv_j^{(i)},\Sm_j^{(i)}) }
\end{eqnarray*}

This step is equivalent to having a set $Q$ of continuous hidden
variables, taking values in the interval $[0,1]$, that give a labeling
of the data by telling to which extent a point $\xv_n$ belongs to the
class $q_k$. This represents a soft classification, since a point can
belong, e.g., by 60\,\% to class 1 and by 40\,\% to class 2 (think of
Schr\"odinger's cat which is 60\,\% alive and 40\,\% dead as long as
nobody opens the box or performs Bayesian classification).
%
\item {\bf Maximization step}:
\begin{itemize}
\item update the means:
$\muv_{k}^{(i+1)} = \frac{\sum_{n=1}^{N} \xv_n P(q_k^{(i)}|\xv_n,\Tm^{(i)})} {\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})}$
\item update the variances:
$\Sm_{k}^{(i+1)} = \frac{\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})\; (\xv_n - \muv_k^{(i+1)})(\xv_n - \muv_k^{(i+1)})\trn } {\sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})}$
\item update the priors:
$P(q_k^{(i+1)}|\Tm^{(i+1)}) = \frac{1}{N} \sum_{n=1}^{N} P(q_k^{(i)}|\xv_n,\Tm^{(i)})$
\end{itemize}
%
In the present case, all the data points participate in the update of all
the models, but their participation is weighted by the value of
$P(q_k^{(i)}|\xv_n,\Tm^{(i)})$.
\item Go to 1.
\end{enumerate}
\item {\bf Until}: the total likelihood increase for the training data
falls under some desired threshold.
\end{itemize}
The global criterion in the present case is the joint likelihood of
all data with respect to all the models:
\begin{align*}
{\cal L}(\Tm) = \log p(X|\Tm)
& = \log \sum_Q p(X,Q|\Tm)\\
& = \log \sum_Q P(Q|X,\Tm)p(X|\Tm) \quad\text{(Bayes)} \\
& = \log \sum_{k=1}^{K} P(q_k|X,\Tm) p(X|\Tm)
\end{align*}
Applying Jensen's inequality $\left( \log \sum_j \lambda_j y_j \geq \sum_j \lambda_j \log y_j \mbox{ if } \lambda_j \geq 0 \mbox{ and } \sum_j \lambda_j = 1 \right)$,
we obtain:
\begin{align*}
{\cal L}(\Tm) \ge J(\Tm)
&= \sum_{k=1}^{K} P(q_k|X,\Tm) \log p(X|\Tm)\\
&= \sum_{k=1}^{K} \sum_{n=1}^{N} P(q_k|\xv_n,\Tm) \log p(\xv_n|\Tm)
\end{align*}
Hence, the criterion $J(\Tm)$ represents a lower boundary for ${\cal L}(\Tm)$. This criterion is locally maximized by the algorithm.

\subsubsection*{Experiment:}
Use the EM explorer utility:
\begin{verbatim}
EMALGO EM algorithm explorer

Launch it with EMALGO(DATA,NCLUST) where DATA is the matrix
of observations (one observation per row) and NCLUST is the
desired number of clusters.

The clusters are initialized with a heuristic that spreads
them randomly around mean(DATA) with standard deviation
sqrtm(cov(DATA)*10). Their initial covariance is set to cov(DATA).

If you want to set your own initial clusters, use
EMALGO(DATA,MEANS,VARS) where MEANS and VARS are cell arrays
containing respectively NCLUST initial mean vectors and NCLUST
initial covariance matrices. In this case, the initial a-priori
probabilities are set equal to 1/NCLUST.

To set your own initial priors, use EMALGO(DATA,MEANS,VARS,PRIORS)
where PRIORS is a vector containing NCLUST a priori probabilities.

Example: for two clusters
means{1} = [1 2]; means{2} = [3 4];
vars{1} = [2 0;0 2]; vars{2} = [1 0;0 1];
emalgo(data,means,vars);

\end{verbatim}
Launch \com{emalgo} again with the same dataset \com{allvow}. Do
several runs with different cases of initialization of the algorithm:
\begin{enumerate}
\item 5 clusters determined according to the default heuristic;
\item initial \com{MEANS} values equal to some data points, and some
random \com{VARS} values (e.g., \com{cov(allvow)} for all the
classes);
\item the initial \com{MEANS} and \com{VARS} values found by the
$K$-means algorithm.
\item initial \com{MEANS} values equal to $\{\muv_{\text{/a/}}, \muv_{\text{/e/}}, \muv_{\text{/i/}}, \muv_{\text{/o/}}, \muv_{\text{/y/}}\}$, \com{VARS} values equal to \linebreak
$\{\Sm_{\text{/a/}}, \Sm_{\text{/e/}}, \Sm_{\text{/i/}}, \Sm_{\text{/o/}}, \Sm_{\text{/y/}}\}$, and \com{PRIORS} values
equal to
$[P_{\text{/a/}},P_{\text{/e/}},P_{\text{/i/}},P_{\text{/o/}},P_{\text{/y/}}]$;
\item initial \com{MEANS} and \com{VARS} values chosen by yourself.
\end{enumerate}
(If you have time, also increase the number of clusters and play again with
the algorithm.)

Iterate the algorithm until the total likelihood reaches an asymptotic
convergence. Observe the evolution of the clusters and of the total
likelihood curve. (In the EM case, the data points attribution chart is not
given because each data point participates in the update of each cluster.)
Observe the mean, variance and prior values found after the convergence of
the algorithm. Compare them with the values found in
section~\ref{gaussmod}.

\subsubsection*{Example:}
\mat{emalgo(allvow,5);}
- or - \\
\mat{means =  \{ mu\_a, mu\_e, mu\_i, mu\_o, mu\_y \};}
\mat{vars = \{ sigma\_a, sigma\_e, sigma\_i, sigma\_o, sigma\_y \};}
\mat{emalgo(allvow,means,vars);}
Enlarge the window, then push the buttons, zoom etc.
After convergence, use:\\
\mat{for k=1:5; disp(emalgo\_result\_means\{k\}); end}
\mat{for k=1:5; disp(emalgo\_result\_vars\{k\}); end}
\mat{for k=1:5; disp(emalgo\_result\_priors(k)); end}
to see the resulting means, variances and priors.

\subsubsection*{Question:}
\begin{enumerate}
\item Does the final solution depend on the initialization of the
algorithm?
\item Describe the evolution of the total likelihood. Is it monotonic?
\item In terms of optimization of the likelihood, what does the final
solution correspond to?
\item Is the algorithm suitable for fitting Gaussian clusters?
\end{enumerate}

\subsection*{Questions on the $K$-means, Viterbi-EM, and EM algorithms:}
\subsubsection*{Find the right statements:}
\begin{itemize}
\item[$\Box$] With the $K$-means algorithm the solution is independent
of the initialization.
\item[$\Box$] With the $K$-means algorithm the discriminant surfaces are
linear.
\item[$\Box$] The $K$-means algorithm is well suited for fitting
Gaussian clusters
\item[$\Box$] In the $K$-means algorithm the global criterion used to
minimize is the maximum likelihood.
\item[$\Box$] In all 3 algorithms the measure used as global criterion
decreases in a monotonic way.
\item[$\Box$] In the Viterbi-EM and EM algorithm the solution is
highly dependent upon initialization.
\item[$\Box$] The EM algorithm is best suited for fitting Gaussian
clusters.
\item[$\Box$] It is an easy task to guess the parameters for
initialization.
\item[$\Box$] With the Viterbi-EM algorithm the discriminant surfaces
are linear.
\item[$\Box$] With the EM algorithm the discriminant surfaces have the
form of (hyper)parabola.
\item[$\Box$] The EM algorithm needs less computational effort than
the Viterbi-EM algorithm.
\item[$\Box$] In the EM algorithm and the Viterbi-EM algorithm, the
same global criterion is used.
\item[$\Box$] The EM algorithm finds a global optimum.
\end{itemize}

\end{document}