www.pudn.com > FP-GROWTH.rar > chi2.tex


\documentclass[a4paper]{article}
\oddsidemargin 2.1mm
\textwidth     155mm
\topmargin     -12mm
\textheight    230mm

\def\tabstrut{\rule{0pt}{2.4ex}}
\def\eq{\!\!\!=\!\!\!}

\begin{document}

\subsection*{The Normalized $\chi^2$ Measure
             for Association Rule Evaluation}

Let $C$ and $A$ be two attributes with domains
$\mbox{dom}(A) = \{ a_1, \ldots a_{n_A} \}$ and
$\mbox{dom}(C) = \{ c_1, \ldots c_{n_C} \}$, respectively,
and let $\cal X$ be a dataset over $C$ and $A$.
Let $N_{ij}$, $1 \le i \le n_C$, $1 \le j \le n_A$, be the number of
sample cases in $\cal X$, which contain both the attribute values~$c_i$
and $a_j$. Furthermore, let
\[ N_{i.} = \sum_{j=1}^{n_A} N_{ij}, \qquad
   N_{.j} = \sum_{i=1}^{n_C} N_{ij}, \qquad\mbox{and}\qquad
   N_{..} = \sum_{i=1}^{n_C} \sum_{j=1}^{n_A} N_{ij} = |{\cal X}|. \]
Finally, let
\[ p_{i.} = \frac{N_{i.}}{N_{..}}, \qquad
   p_{.j} = \frac{N_{.j}}{N_{..}}, \qquad\mbox{and}\qquad
   p_{ij} = \frac{N_{ij}}{N_{..}} \]
be the probabilities of the attribute values and their combinations,
as they can be estimated from these numbers. Then the well-known
$\chi^2$ measure is usually defined as
\begin{eqnarray*}
\chi^2(C,A)
& = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
      \frac{(E_{ij} -N_{ij})^2}{E_{ij}}
      \qquad\mbox{where}\quad E_{ij} = \frac{N_{i.}N_{.j}}{N_{..}} \\
& = & \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
      \frac{\left(\frac{N_{i.}N_{.j}}{N_{..}} -N_{ij}\right)^2}
           {\frac{N_{i.}N_{.j}}{N_{..}}}
~~=~~ \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
      \frac{N_{..}^2 \left(\frac{N_{i.\phantom{j}}}{N_{..}}
                           \frac{N_{.j}}{N_{..}}
                         - \frac{N_{ij}}{N_{..}}\right)^2}
           {N_{..}\;       \frac{N_{i.\phantom{j}}}{N_{..}}
                           \frac{N_{.j}}{N_{..}}} \\
& = & N_{..} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
      \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}}
~~=~~ N_{..} \sum_{i=1}^{n_C} \sum_{j=1}^{n_A}
      \frac{(N_{i.}\;N_{.j} - N_{..}N_{ij})^2}{N_{i.}\;N_{.j}}.
\end{eqnarray*}
This measure is often normalized by dividing it by the
size~$N_{..} = |{\cal X}|$ of the dataset to remove the
dependence on the number of sample cases.

For association rule evaluation, $C$ refers the consequent and $A$ to
the antecedent of the rule. Both have two values, which we denote by
$c_0$, $c_1$ and $a_0$, $a_1$, respectively. $c_0$ means that the
consequent of the rule is not satisfied, $c_1$ that it is satisfied;
likewise for $A$. Then we have to compute the $\chi^2$ measure from
the $2 \times 2$ contingency table
\begin{center}
\begin{tabular}{|l|c|c|l|} \cline{2-3}
\multicolumn{1}{l|}{}
      & $a_0$    & $a_1$    \\ \hline
$c_0$ & $N_{00}$ & $N_{01}$ & $N_{0.}$\tabstrut \\ \hline
$c_1$ & $N_{10}$ & $N_{11}$ & $N_{1.}$\tabstrut \\ \hline
\multicolumn{1}{l|}{}
      & $N_{.0}$ & $N_{.1}$ & $N_{..}$\tabstrut \\ \cline{2-4}
\end{tabular}
\end{center}
or the estimated probability table
\begin{center}
\begin{tabular}{|l|c|c|l|} \cline{2-3}
\multicolumn{1}{l|}{}
      & $a_0$    & $a_1$    \\ \hline
$c_0$ & $p_{00}$ & $p_{01}$ & $p_{0.}$\tabstrut \\ \hline
$c_1$ & $p_{10}$ & $p_{11}$ & $p_{1.}$\tabstrut \\ \hline
\multicolumn{1}{l|}{}
      & $p_{.0}$ & $p_{.1}$ & $1$\tabstrut \\ \cline{2-4}
\end{tabular}
\end{center}
That is, we have
\begin{eqnarray*}
\frac{\chi^2(C,A)}{N_{..}}
& = & \sum_{i=0}^1 \sum_{j=0}^1
      \frac{(p_{i.}\;p_{.j} - p_{ij})^2}{p_{i.}\;p_{.j}}. \\
& = & \frac{(p_{0.}\;p_{.0} -p_{00})^2}{p_{0.}\;p_{.0}}
  +   \frac{(p_{0.}\;p_{.1} -p_{01})^2}{p_{0.}\;p_{.1}}
  +   \frac{(p_{1.}\;p_{.0} -p_{10})^2}{p_{1.}\;p_{.0}}
  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}}
\end{eqnarray*}
Now we can exploit
\[ p_{00} + p_{01} = p_{0.}, \quad
   p_{10} + p_{10} = p_{1.}, \quad
   p_{00} + p_{10} = p_{.0}, \quad
   p_{01} + p_{11} = p_{.1}, \quad
   p_{0.} + p_{1.} = 1, \quad
   p_{.0} + p_{.1} = 1, \]
which leads to
\begin{eqnarray*}
p_{0.}\;p_{.0} -p_{00}
& = & (1 -p_{1.})(1 -p_{.1}) -(1 -p_{1.} -p_{.1} +p_{11})
~~=~~ p_{1.}\;p_{.1} -p_{11}, \\
p_{0.}\;p_{.1} -p_{01}
& = & (1 -p_{1.})p_{.1} -(p_{.1} -p_{11})
~~=~~ p_{11} -p_{1.}\;p_{.1}, \\
p_{1.}\;p_{.0} -p_{10}
& = & p_{1.}(1 -p_{.1}) -(p_{1.} -p_{11})
~~=~~ p_{11} -p_{1.}\;p_{.1}. \\
\end{eqnarray*}
Therefore it is
\begin{eqnarray*}
\frac{\chi^2(C,A)}{N_{..}}
& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})(1 -p_{.1})}
  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{(1 -p_{1.})\;p_{.1}}
  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}(1 -p_{.1})}
  +   \frac{(p_{1.}\;p_{.1} -p_{11})^2}{p_{1.}\;p_{.1}} \\
& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
            (p_{1.}\;p_{.1}
            +p_{1.}(1 -p_{.1})
            +(1 -p_{1.})p_{.1}
            +(1 -p_{1.})(1 -p_{.1}))}
           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2
            (p_{1.}\;p_{.1}
            +p_{1.} -p_{1.}\;p_{.1}
            +p_{.1} -p_{1.}\;p_{.1}
            +1 -p_{1.} -p_{.1} +p_{1.}\;p_{.1})}
           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})} \\
& = & \frac{(p_{1.}\;p_{.1} -p_{11})^2}
           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
\end{eqnarray*}
In the program, $p_{1.}$ (argument {\tt head}), $p_{.1}$
(argument {\tt body}) and $p_{1|1} = \frac{p_{11}}{p_{.1}}$
(argument {\tt post}, rule confidence) are passed to the routine
that computes the measure, so the actual computation is
\begin{eqnarray*}
\frac{\chi^2(C,A)}{N_{..}}
& = & \frac{(p_{1.}\;p_{.1} -p_{1|1}\;p_{.1})^2}
           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
~~=~~ \frac{((p_{1.} -p_{1|1})p_{.1})^2}
           {p_{1.}(1 -p_{1.})p_{.1}(1 -p_{.1})}.
\end{eqnarray*}
In an analogous way the measure can also be computed from the absolute
frequencies $N_{ij}$, $N_{i.}$, $N_{.j}$ and $N_{..}$, namely as
\begin{eqnarray*}
\frac{\chi^2(C,A)}{N_{..}}
& = & \frac{(N_{1.}N_{.1} -N_{..}N_{11})^2}
           {N_{1.}(N_{..} -N_{1.})N_{.1}(N_{..} -N_{.1})}.
\end{eqnarray*}
\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: