www.pudn.com > bayes.rar > nbayes.c


/*----------------------------------------------------------------------
  File    : nbayes.c
  Contents: Naive Bayes classifier management
  Author  : Christian Borgelt
  History : 07.12.1998 file created
            08.12.1998 nbc_create, nbc_dup, nbc_delete, nbc_add prog.
            10.12.1998 function nbc_desc completed
            11.12.1998 function nbc_exec completed
            12.12.1998 function nbc_parse completed
            16.12.1998 all functions debugged
            13.02.1999 tuple parameters added to nbc_add and nbc_exec
            10.01.1999 execution for one att. made a separate function
            11.03.1999 function nbc_induce added
            25.03.1999 distrib. of tuple weight for unknown values added
            27.03.1999 functions nbc_exp und nbc_var added
            15.05.1999 automatic frequency vector resizing added
            10.11.2000 function nbc_exec adapted
            18.11.2000 function nbc_setup added, nbc_exec adapted
            21.11.2000 redesign completed
            11.02.2001 bug in function nbc_mark (> instead of >=) fixed
            15.07.2001 parser improved (global variables removed)
            16.07.2001 adapted to modified module scan
            17.07.2001 parser improved (conditional look ahead)
            26.04.2003 function nbc_rand added
            15.04.2004 zero variances replaced by EPSILON
            12.08.2004 adapted to new module parse
----------------------------------------------------------------------*/
#include 
#include 
#include 
#include 
#include 
#include "nbayes.h"
#ifdef STORAGE
#include "storage.h"
#endif

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
#define	M_PI        3.14159265358979323846  /* \pi */
#define EPSILON     1e-12       /* to handle roundoff errors */
#define BLKSIZE     16          /* block size for vectors */

/*----------------------------------------------------------------------
  Type Definitions
----------------------------------------------------------------------*/
typedef struct {                /* --- selectable attribute --- */
  int    attid;                 /* attribute identifier */
  double errs;                  /* number of misclassifications */
} SELATT;                       /* (selectable attribute) */

/*----------------------------------------------------------------------
  Auxiliary Functions
----------------------------------------------------------------------*/
#ifdef NBC_INDUCE

static int _clsrsz (NBC *nbc, int clscnt)
{                               /* --- resize class dependent vectors */
  int    i, k, n;               /* loop variables, buffer */
  int    clsvsz;                /* size of the class dep. vectors */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(nbc && (clscnt >= 0)); /* check the function arguments */

  /* --- resize the class dependent vectors --- */
  clsvsz = nbc->clsvsz;         /* get the class dep. vector size */
  if (clscnt >= clsvsz) {       /* if the vectors are too small */
    clsvsz += (clsvsz > BLKSIZE) ? clsvsz >> 1 : BLKSIZE;
    if (clscnt >= clsvsz) clsvsz = clscnt;
    frq = (double*)realloc(nbc->frqs, clsvsz *4 *sizeof(double));
    if (!frq) return -1;        /* resize the frequencies vector */
    nbc->frqs   = frq;          /* and set the new vector */
    nbc->priors = nbc->frqs   +clsvsz;  /* organize the rest */
    nbc->posts  = nbc->priors +clsvsz;  /* of the allocated */
    nbc->cond   = nbc->posts  +clsvsz;  /* memory block */
    n = clsvsz -nbc->clsvsz;    /* calc. number of new vector fields */
    for (frq += clsvsz, k = n; --k >= 0; )
      *--frq = 0;               /* clear the new vector fields */
    for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
      if ((--dvec)->type == 0)  /* traverse all attributes */
        continue;               /* except the class attribute */
      if (dvec->type == AT_SYM){/* if the attribute is symbolic */
        discd = (DISCD*)realloc(dvec->discds, clsvsz *sizeof(DISCD));
        if (!discd) return -1;  /* resize the discrete dists. vector */
        dvec->discds = discd;   /* and set the new vector */
        for (discd += clsvsz, k = n; --k >= 0; ) {
          (--discd)->cnt = 0; discd->frqs = NULL;
        } }                     /* clear the new vector fields */
      else {                    /* if the attribute is numeric */
        normd = (NORMD*)realloc(dvec->normds, clsvsz *sizeof(NORMD));
        if (!normd) return -1;  /* resize the normal dists. vector */
        dvec->normds = normd;   /* and set the new vector */
        for (normd += clsvsz, k = n; --k >= 0; ) {
          (--normd)->cnt = 0; normd->sv = normd->sv2 = 0; }
      }                         /* clear the new vector fields */
    }  /* for (dvec = ... */
    nbc->clsvsz = clsvsz;       /* set new size of the class vectors */
  }  /* if (clscnt >= clsvsz) ... */

  /* --- create new value frequency vectors --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type != AT_SYM)
      continue;                 /* traverse all symbolic attributes */
    discd = dvec->discds +clscnt;
    for (k = clscnt -nbc->clscnt; --k >= 0; ) {
      (--discd)->frqs =         /* allocate a value frequency vector */
      frq = (double*)malloc(dvec->valvsz *2 *sizeof(double));
      if (!frq) break;          /* set the probabilities vector */
      discd->probs = frq +dvec->valvsz;
      for (frq += n = dvec->valvsz; --n >= 0; )
        *--frq = 0;             /* traverse the frequency vectors */
    }                           /* and init. the value frequencies */
    if (k >= 0) break;          /* on error abort the loop */
  }
  if (i >= 0) {                 /* if an error occurred */
    for (i = nbc->attcnt -i; --i >= 0; dvec++) {
      if ((--dvec)->type != AT_SYM) continue;
      discd = dvec->discds +clscnt;
      for (k = clscnt -nbc->clscnt; --k >= 0; )
        if ((--discd)->frqs) { free(discd->frqs); discd->frqs = NULL; }
    }                           /* delete the newly created value */
    return -1;                  /* frequency vectors of the symbolic */
  }                             /* attributes and abort the function */

  nbc->clscnt = clscnt;         /* set the new number of classes */
  return 0;                     /* return 'ok' */
}  /* _clsrsz() */

/*--------------------------------------------------------------------*/

static int _valrsz (DVEC *dvec, int clscnt, int valcnt)
{                               /* --- resize the value freq. vectors */
  int    i, k, n;               /* loop variables, num. of new elems. */
  int    valvsz;                /* size of the value freq. vectors */
  int    bsz;                   /* size of vector in bytes */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(dvec                   /* check the function argument */
     && (dvec->type == AT_SYM) && (clscnt >= 0) && (valcnt >= 0));
  valvsz = dvec->valvsz;        /* get the value freq. vector size */
  if (valcnt > valvsz) {        /* if the vectors are too small */
    valvsz += (valvsz > BLKSIZE) ? valvsz >> 1 : BLKSIZE;
    if (valcnt > valvsz) valvsz = valcnt;
    n   = valvsz -dvec->valcnt; /* get the number of new elements */
    bsz = valvsz *2 *sizeof(double);                     
    for (discd = dvec->discds +(i = clscnt); --i >= 0; ) {
      --discd;                  /* traverse the discrete distribs. */
      frq = (double*)realloc(discd->frqs, bsz);
      if (!frq) break;          /* resize the value freq. vector */
      discd->frqs  = frq;       /* and the probabilities vector */
      discd->probs = frq +valvsz;    /* and set the new vectors */
      for (frq += valvsz, k = n; --k >= 0; )
        *--frq = 0;             /* clear the new vector elements */
    }
    if (i < 0) {                /* if an error occurred */
      bsz = dvec->valvsz *2 *sizeof(double);
      for (i = clscnt -i -1; --i >= 0; ) {
        ++discd;                /* traverse the processed distribs. */
        discd->frqs  = (double*)realloc(discd->frqs, bsz);
        discd->probs = discd->frqs +dvec->valvsz;
      }                         /* shrink all value freq. vectors */
      return -1;                /* to their old size */
    }                           /* and then abort */
  }
  dvec->valcnt = valcnt;        /* set the new number of values */
  return 0;                     /* return 'ok' */
}  /* _valrsz() */

#endif
/*--------------------------------------------------------------------*/

static int _exec (const NBC *nbc, int attid, const INST *inst)
{                               /* --- execute for one attribute */
  int         i, k;             /* loop variable, buffer */
  const DVEC  *dvec;            /* to traverse the distrib. vectors */
  const NORMD *normd;           /* to traverse the normal   distribs. */
  const DISCD *discd;           /* to traverse the discrete distribs. */
  double      *prob;            /* to traverse the class probs. */
  double      v, d, s;          /* temporary buffers */

  assert(nbc && inst            /* check the function arguments */
      && (attid >= 0) && (attid < nbc->attcnt));
  dvec = nbc->dvecs +attid;     /* get the distribution vector */
  assert(dvec->type != 0);      /* and check the attribute type */
  if (dvec->type == AT_SYM) {   /* --- if the attribute is symbolic */
    k = inst->i;                /* get and check the attribute value */
    if ((k < 0) || (k >= dvec->valcnt)) return -1;
    discd = dvec->discds +nbc->clscnt;
    prob  = nbc->cond    +nbc->clscnt;
    for (i = nbc->clscnt; --i >= 0; ) {
      d = (--discd)->probs[k];  /* traverse the discrete distribs. */
      *--prob = (d > 0) ? d : EPSILON;
    } }                         /* copy the class probabilities */
  else {                        /* --- if the attribute is numeric */
    if (dvec->type == AT_FLT) { /* if the attribute is real valued */
      if (inst->f <= UV_FLT) return -1;
      v = (double)inst->f; }    /* check and get the attribute value */
    else {                      /* if the attribute is integer valued */
      if (inst->i <= UV_INT) return -1;
      v = (double)inst->i;      /* check and get the attribute value */
    }                           /* (convert it to double) */
    normd = dvec->normds +nbc->clscnt;
    prob  = nbc->cond    +nbc->clscnt;
    for (i = nbc->clscnt; --i >= 0; ) {
      d = v -(--normd)->exp;    /* traverse the normal distributions */
      s = 2 *normd->var;        /* and get their parameters */
      if (s < EPSILON) s = EPSILON;
      *--prob = exp(-d*d/s) /sqrt(M_PI*s);
    }                           /* compute the probability density */
  }                             /* at the value of the attribute */
  return 0;                     /* return 'ok' */
}  /* _exec() */

/*--------------------------------------------------------------------*/

static double _normd (double drand (void))
{                               /* --- compute N(0,1) distrib. number */
  static double b;              /* buffer for random number */
  double x, y, r;               /* coordinates and radius */

  if (b != 0.0) {               /* if the buffer is full, */
    x = b; b = 0; return x; }   /* return the buffered number */
  do {                          /* pick a random point */
    x = 2.0*drand()-1.0;        /* in the unit square [-1,1]^2 */
    y = 2.0*drand()-1.0;        /* and check whether it lies */
    r = x*x +y*y;               /* inside the unit circle */
  } while ((r > 1) || (r == 0));
  r = sqrt(-2*log(r)/r);        /* factor for Box-Muller transform */
  b = x *r;                     /* save one of the random numbers */
  return y *r;                  /* and return the other */
}  /* _normd() */

/*--------------------------------------------------------------------*/
#ifdef NBC_INDUCE

static int _eval (NBC *nbc, TABLE *table, int mode,
                  SELATT *savec, int cnt)
{                               /* --- evaluate selectable attributes */
  int    i, k, n;               /* loop variables, buffers */
  SELATT *sa;                   /* to traverse the selectable atts. */
  TUPLE  *tpl;                  /* to traverse the tuples */
  double *s, *d;                /* to traverse the probabilities */
  double max, tmp;              /* maximum of probabilities, buffer */
  int    old, new;              /* old and new predicted class */
  int    cls;                   /* actual class of a tuple */

  assert(nbc && table && savec  /* check the function arguments */
     && (cnt > 0) && (mode & (NBC_ADD|NBC_REMOVE)));
  for (n = tab_tplcnt(table); --n >= 0; ) {
    tpl = tab_tpl(table, n);    /* traverse the tuples in the table */
    cls = tpl_colval(tpl, nbc->clsid)->i;
    if (cls < 0) continue;      /* skip tuples with an unknown class */
    old = nbc_exec(nbc, tpl, NULL);
    for (sa = savec +(i = cnt); --i >= 0; ) {
      --sa;                     /* traverse the selectable attributes */
      if (_exec(nbc, sa->attid, tpl_colval(tpl, sa->attid)) != 0)
        new = old;              /* evaluate the classifier and */
      else {                    /* on failure use the old class */
        s = nbc->cond;          /* if a probability distribution */
        d = nbc->posts;         /* could be determined, traverse it */
        if (mode & NBC_ADD) {   /* if to add attributes, */
          max = *d * *s;        /* multiply with cond. probability */
          for (new = 0, k = 1; k < nbc->clscnt; k++) {
            tmp = *++d * *++s;  /* compute new probability */
            if (tmp > max) { max = tmp; new = k; }
          } }                   /* find the most probable class */
        else {                  /* if to remove attributes, */
          max = *d / *s;        /* divide by cond. probability */
          for (new = 0, k = 1; k < nbc->clscnt; k++) {
            tmp = *++d / *++s;  /* compute new probability */
            if (tmp > max) { max = tmp; new = k; }
          }                     /* find the most probable class */
        }                       /* for the current tuple */
      }                         /* (det. new classification result) */
      if (new != cls) sa->errs += tpl_getwgt(tpl);
    }                           /* count the misclassifications */
  }                             /* of the modified classifier */
  return 0;                     /* return 'ok' */
}  /* _eval() */

#endif
/*----------------------------------------------------------------------
  Main Functions
----------------------------------------------------------------------*/

NBC* nbc_create (ATTSET *attset, int clsid)
{                               /* --- create a naive Bayes class. */
  int    i, k, n;               /* loop variables */
  NBC    *nbc;                  /* created classifier */
  ATT    *att;                  /* to traverse the attributes */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(attset && (clsid >= 0) /* check the function arguments */
      && (clsid < as_attcnt(attset))
      && (att_type(as_att(attset, clsid)) == AT_SYM));

  /* --- create the classifier body --- */
  i   = as_attcnt(attset);      /* get the number of attributes */
  nbc = (NBC*)malloc(sizeof(NBC) +(i-1) *sizeof(DVEC));
  if (!nbc) return NULL;        /* allocate the classifier body */
  for (dvec = nbc->dvecs +(k = i); --k >= 0; ) {
    (--dvec)->discds = NULL; dvec->normds = NULL;
  }                             /* clear the distribution vectors */
  nbc->attset = attset;         /* (for a proper clean up on error) */
  nbc->attcnt = i;              /* and initialize the other fields */
  nbc->clsid  = clsid;
  nbc->clsvsz = att_valcnt(as_att(attset, clsid));
  nbc->clscnt = nbc->clsvsz;
  nbc->total  = 0;
  nbc->lcorr  = 0;
  nbc->mode   = 0;

  /* --- initialize the class distributions --- */
  if (nbc->clscnt <= 0) {       /* if there are no classes, */
    nbc->frqs   =               /* no class vectors are needed */
    nbc->priors = nbc->posts = nbc->cond = NULL; }
  else {                        /* if there are classes, */
    nbc->frqs =                 /* allocate class vectors */
    frq = (double*)malloc(nbc->clsvsz *4 *sizeof(double));
    if (!frq) { nbc_delete(nbc, 0); return NULL; }
    nbc->priors = frq         +nbc->clsvsz;
    nbc->posts  = nbc->priors +nbc->clsvsz;
    nbc->cond   = nbc->posts  +nbc->clsvsz;
    for (frq += k = nbc->clsvsz; --k >= 0; )
      *--frq = 0;               /* traverse the frequency vector */
  }                             /* and init. the class frequencies */

  /* --- initialize the conditional distributions --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    (--dvec)->mark = -1;        /* traverse and unmark all attributes */
    if (i == clsid) {           /* if this is the class attribute, */
      dvec->type = 0; continue;}/* clear the type for easier recogn. */
    att = as_att(attset, i);    /* get the next attribute */
    dvec->type = att_type(att); /* and its type */
    if (dvec->type == AT_SYM) { /* -- if the attribute is symbolic */
      dvec->valcnt =            /* set the number of att. values */
      dvec->valvsz = att_valcnt(att);
      if (nbc->clscnt <= 0)     /* if there are no classes, */
        continue;               /* there is nothing else to do */
      dvec->discds =            /* create a vector of discrete dists. */
      discd = (DISCD*)calloc(nbc->clsvsz, sizeof(DISCD));
      if (!discd) { nbc_delete(nbc, 0); return NULL; }
      if (dvec->valcnt <= 0)    /* if the attribute has no values, */
        continue;               /* there is nothing else to do */
      for (discd += k = nbc->clscnt; --k >= 0; ) {
        (--discd)->frqs =       /* create a value frequency vector */
        frq = (double*)malloc(dvec->valvsz *2 *sizeof(double));
        if (!frq) { nbc_delete(nbc, 0); return NULL; }
        discd->probs = frq +dvec->valvsz;
        for (frq += n = dvec->valvsz; --n >= 0; )
          *--frq = 0;           /* traverse the frequency vectors */
      } }                       /* and init. the value frequencies */
    else {                      /* -- if the attribute is numeric */
      dvec->valcnt = dvec->valvsz = 0;
      if (nbc->clscnt <= 0)     /* if there are no classes, */
        continue;               /* there is nothing else to do */
      dvec->normds =            /* create a vector of normal dists. */
      normd = (NORMD*)malloc(nbc->clsvsz *sizeof(NORMD));
      if (!normd) { nbc_delete(nbc, 0); return NULL; }
      for (normd += k = nbc->clsvsz; --k >= 0; ) {
        (--normd)->cnt = 0; normd->sv = normd->sv2 = 0; }
    }                           /* clear the sums from which expected */
  }                             /* value and variance are computed */

  return nbc;                   /* return the created classifier */
}  /* nbc_create() */

/*--------------------------------------------------------------------*/

NBC* nbc_dup (NBC *nbc, int dupas)
{                               /* --- duplicate a naive Bayes class. */
  NBC    *dup;                  /* created classifier duplicate */
  ATTSET *attset;               /* duplicate of attribute set */
  int    i, k, n;               /* loop variables */
  DVEC   *dv; const DVEC   *sv; /* to traverse the distrib. vectors */
  NORMD  *dn; const NORMD  *sn; /* to traverse the normal   distribs. */
  DISCD  *dd; const DISCD  *sd; /* to traverse the discrete distribs. */
  double *df; const double *sf; /* to traverse the frequency vectors */

  assert(nbc);                  /* check the function argument */

  /* --- copy the classifier body --- */
  attset = nbc->attset;         /* get the attribute set */
  if (dupas) {                  /* if the corresp. flag is set, */
    attset = as_dup(attset);    /* duplicate the attribute set */
    if (!attset) return NULL;   /* of the original classifier, */
  }                             /* and then create a classifier */
  dup = (NBC*)malloc(sizeof(NBC) +(nbc->attcnt-1) *sizeof(DVEC));
  if (!dup) { if (dupas) as_delete(attset); return NULL; }
  for (dv = dup->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    (--dv)->discds = NULL; dv->normds = NULL;
  }                             /* clear the distribution vectors */
  dup->attset = attset;         /* (for a proper clean up on error) */
  dup->attcnt = nbc->attcnt;    /* and copy the other fields */
  dup->clsid  = nbc->clsid;
  dup->clsvsz = nbc->clscnt;
  dup->clscnt = nbc->clscnt;
  dup->total  = nbc->total;
  dup->lcorr  = nbc->lcorr;
  dup->mode   = nbc->mode;

  /* --- copy the class distributions --- */
  if (nbc->clscnt <= 0)         /* if there are no classes, */
    dup->frqs   =               /* no class vectors are needed */
    dup->priors = dup->posts = dup->cond = NULL;
  else {                        /* if there are classes, */
    dup->frqs =                 /* allocate class vectors */
    df = (double*)malloc(dup->clsvsz *4 *sizeof(double));
    if (!df) { nbc_delete(dup, dupas); return NULL; }
    dup->priors = dup->frqs   +dup->clsvsz;
    dup->posts  = dup->priors +dup->clsvsz;
    dup->cond   = dup->posts  +dup->clsvsz;
    sf = nbc->frqs +2 *dup->clscnt;
    for (df += k = 2 *dup->clscnt; --k >= 0; )
      *--df = *--sf;            /* traverse the frequency vector */
  }                             /* and copy the class frequencies */

  /* --- copy the conditional distributions --- */
  sv = nbc->dvecs +nbc->attcnt; /* get pointers to the */
  dv = dup->dvecs +nbc->attcnt; /* distribution vectors */
  for (i = nbc->attcnt; --i >= 0; ) {
    --sv; --dv;                 /* traverse the distribution vectors */
    dv->mark   = sv->mark;      /* copy the attribute mark, */
    dv->type   = sv->type;      /* the attribute type, */
    dv->valvsz = sv->valcnt;    /* the value vector size, and */
    dv->valcnt = sv->valcnt;    /* the number of attribute values */
    if ((sv->type    == 0)      /* if this is the class attribute */
    ||  (nbc->clscnt <= 0))     /* or if there are no classes, */
      continue;                 /* there is nothing else to do */
    if (sv->type == AT_SYM) {   /* -- if the attribute is symbolic */
      dv->discds =              /* create a vector of discrete dists. */
      dd = (DISCD*)calloc(dup->clsvsz, sizeof(DISCD));
      if (!dd) { nbc_delete(dup, dupas); return NULL; }
      if (sv->valcnt <= 0)      /* if the attribute has no values, */
        continue;               /* there is nothing else to do */
      sd = sv->discds +nbc->clscnt;
      for (dd += (k = nbc->clscnt); --k >= 0; ) {
        --dd; --sd;             /* traverse the discrete distribs. */
        dd->cnt  = sd->cnt;     /* copy the total frequency and */
        dd->frqs =              /* create a value frequency vector */
        df = (double*)malloc(dv->valvsz *2 *sizeof(double));
        if (!df) { nbc_delete(dup, dupas); return NULL; }
        dd->probs = df +dv->valvsz;
        sf = sd->frqs +2 *dv->valvsz;
        for (df += n = 2 *dv->valvsz; --n >= 0; )
          *--df = *--sf;        /* traverse the frequency vectors */
      } }                       /* and copy the value frequencies */
    else {                      /* -- if the attribute is numeric */
      dv->normds =              /* create a vector of normal dists. */
      dn = (NORMD*)malloc(dup->clsvsz *sizeof(NORMD));
      if (!dn) { nbc_delete(dup, dupas); return NULL; }
      sn = sv->normds +dup->clsvsz;
      for (dn += k = dup->clsvsz; --k >= 0; )
        *--dn = *--sn;          /* copy the normal distributions */
    }                           /* (including computed estimates) */
  }
  return dup;                   /* return the created duplicate */
}  /* nbc_dup() */

/*--------------------------------------------------------------------*/

void nbc_delete (NBC *nbc, int delas)
{                               /* --- delete a naive Bayes class. */
  int   i, k;                   /* loop variables */
  DVEC  *dvec;                  /* to traverse the distrib. vectors */
  DISCD *discd;                 /* to traverse the discrete distribs. */

  assert(nbc);                  /* check the function argument */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->discds) {     /* traverse the attributes */
      for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; )
        if ((--discd)->frqs) free(discd->frqs);
      free(dvec->discds);       /* delete all frequency vectors */
    }                           /* and the distribution vectors */
    if (dvec->normds) free(dvec->normds);
  }                             /* delete the normal distributions */
  if (nbc->frqs) free(nbc->frqs);
  if (delas)     as_delete(nbc->attset);
  free(nbc);                    /* delete the classifier body */
}  /* nbc_delete() */

/*--------------------------------------------------------------------*/

void nbc_clear (NBC *nbc)
{                               /* --- clear a naive Bayes classifier */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  double *frq;                  /* to traverse the frequency vectors */

  assert(nbc);                  /* check the function argument */
  nbc->total = 0;               /* clear the total number of cases */
  for (frq = nbc->frqs +(i = nbc->clscnt); --i >= 0; )
    *--frq = 0;                 /* clear the frequency distribution */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    if (dvec->type == AT_SYM) { /* if the attribute is symbolic */
      for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; ) {
        (--discd)->cnt = 0;     /* traverse the distributions */
        for (frq = discd->frqs +(n = dvec->valcnt); --n >= 0; )
          *--frq = 0;           /* traverse the frequency vector */
      } }                       /* and clear the value frequencies */
    else {                      /* if the attribute is numeric */
      for (normd = dvec->normds +(k = nbc->clscnt); --k >= 0; ) {
        (--normd)->cnt = 0; normd->sv = normd->sv2 = 0; }
    }                           /* clear the sums from which expected */
  }                             /* value and variance are computed */
}  /* nbc_clear() */

/*--------------------------------------------------------------------*/
#ifdef NBC_INDUCE

int nbc_add (NBC *nbc, const TUPLE *tpl)
{                               /* --- add an instantiation */
  int    i;                     /* loop variable */
  int    cls;                   /* value of class attribute */
  float  wgt;                   /* instantiation weight */
  const  INST *inst;            /* to traverse the instances */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to access normal   distributions */
  DISCD  *discd;                /* to access discrete distributions */
  double v;                     /* buffer (for an attribute value) */

  assert(nbc);                  /* check the function argument */

  /* --- get class and weight --- */
  if (tpl) {                    /* if a tuple is given */
    cls = tpl_colval(tpl, nbc->clsid)->i;
    wgt = tpl_getwgt(tpl); }    /* get the class and the tuple weight */
  else {                        /* if no tuple is given */
    cls = att_inst(as_att(nbc->attset, nbc->clsid))->i;
    wgt = as_getwgt(nbc->attset);
  }                             /* get the class and the inst. weight */
  if (cls < 0) return 0;        /* if the class is unknown, abort */
  assert(wgt >= 0.0F);          /* check the tuple weight */

  /* --- update class distribution --- */
  if ((cls >= nbc->clscnt)      /* if the class is a new one, */
  &&  (_clsrsz(nbc,cls+1) != 0))/* resize the class dependent vectors */
    return -1;                  /* (frequencies and distributions) */
  nbc->frqs[cls] += wgt;        /* update the class frequency */
  nbc->total     += wgt;        /* and the total frequency */

  /* --- update conditional distributions --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0)    /* traverse all attributes */
      continue;                 /* except the class attribute */
    inst = (tpl)                /* get the attribute instantiation */
         ? tpl_colval(tpl, i)   /* from the tuple or the att. set */
         : att_inst(as_att(nbc->attset, i));
    if (dvec->type == AT_SYM) { /* -- if the attribute is symbolic */
      if (inst->i < 0)          /* if the att. value is unknown, */
        continue;               /* there is nothing to do */
      if ((inst->i >= dvec->valcnt)
      &&  (_valrsz(dvec, nbc->clscnt, inst->i+1) != 0))
        return -1;              /* resize the value freq. vectors */
      discd = dvec->discds +cls;   /* get the proper distribution */
      discd->frqs[inst->i] += wgt; /* and update the value frequency */
      discd->cnt += wgt; }         /* and the total frequency */
    else {                      /* -- if the attribute is numeric */
      if (dvec->type == AT_FLT){/* if the attribute is real valued */
        if (inst->f <= UV_FLT) continue;
        v = (double)inst->f;}   /* check and get the attribute value */
      else {                    /* if the attribute is integer valued */
        if (inst->i <= UV_INT) continue;
        v = (double)inst->i;    /* check and get the attribute value */
      }                         /* (convert it to double) */
      normd = dvec->normds +cls;/* get the proper distribution */
      normd->cnt += wgt;        /* update the case counter */
      normd->sv  += wgt *v;     /* the sum of the values, and */
      normd->sv2 += wgt *v*v;   /* the sum of their squares */
    }                           /* (expected value and variance */
  }                             /*  are computed in nbc_setup) */
  return 0;                     /* return 'ok' */
}  /* nbc_add() */

/*--------------------------------------------------------------------*/

NBC* nbc_induce (TABLE *table, int clsid, int mode, double lcorr)
{                               /* --- induce a naive Bayes class. */
  int    i, r = 0;              /* loop variable, buffer */
  int    cnt;                   /* number of selectable attributes */
  int    cls;                   /* predicted class */
  NBC    *nbc;                  /* created classifier */
  ATTSET *attset;               /* attribute set of the classifier */
  SELATT *savec;                /* vector of selectable attributes */
  SELATT *sa, *best;            /* to traverse the selectable atts. */
  TUPLE  *tpl;                  /* to traverse the tuples */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  double *p;                    /* to traverse the class probs. */
  double max;                   /* maximum of class probabilities */
  double errs;                  /* weight sum of misclassified tuples */

  assert(table                  /* check the function arguments */
      && (clsid >= 0) && (clsid < tab_colcnt(table))
      && (att_type(as_att(tab_attset(table), clsid)) == AT_SYM));

  /* --- create a classifier --- */
  attset = tab_attset(table);   /* get the attribute set of the table */
  if (mode & NBC_DUPAS) {       /* if the corresp. flag is set, */
    attset = as_dup(attset);    /* duplicate the attribute set */
    if (!attset) return NULL;   /* of the given data table, */
  }                             /* then create a classifier */
  nbc = nbc_create(attset, clsid);
  if (!nbc) { if (mode & NBC_DUPAS) as_delete(attset); return NULL; }

  /* --- build initial classifier --- */
  for (i = tab_tplcnt(table); --i >= 0; )
    nbc_add(nbc, tab_tpl(table, i));    /* start with a */
  nbc_setup(nbc, mode|NBC_ALL, lcorr);  /* full classifier */
  if (!(mode & (NBC_ADD|NBC_REMOVE)))
    return nbc;                 /* if no simp. is requested, abort */

  /* --- evaluate initial classifier --- */
  if (mode & NBC_ADD) {         /* if to add attributes, */
    p = nbc->frqs; errs = max = *p;
    for (i = nbc->clscnt; --i > 0; ) {
      errs += *++p;             /* traverse the class frequencies, */
      if (*p > max) max = *p;   /* sum them, and determine their */
    }                           /* maximum (find the majority class) */
    errs -= max; }              /* compute the number of prior errors */
  else {                        /* if to remove attributes */
    for (errs = 0, i = tab_tplcnt(table); --i >= 0; ) {
      tpl = tab_tpl(table,i);   /* traverse the tuples in the table */
      cls = tpl_colval(tpl, clsid)->i;
      if (cls < 0) continue;    /* skip tuples with an unknown class */
      assert(cls < nbc->clscnt);/* check the class value */
      if (nbc_exec(nbc, tpl, NULL) != cls)
        errs += tpl_getwgt(tpl);/* classify the current tuple */
    }                           /* and determine the number */
  }                             /* of misclassifications */
  #ifndef NDEBUG
  fprintf(stderr, "\n%8g errors initially\n", errs);
  #endif                        /* print a counter for debugging */

  /* --- collect selectable attributes --- */
  savec = malloc(nbc->attcnt *sizeof(SELATT));
  if (!savec) { nbc_delete(nbc, mode & NBC_DUPAS); return NULL; }
  sa = savec;                   /* create vector of selectable atts. */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    (--dvec)->mark = -1;        /* traverse all attributes */
    if ( (dvec->type == 0)      /* except the class attribute */
    ||  ((dvec->type == AT_SYM) /* and all symbolic attributes */
    &&   (dvec->valcnt <= 0)))  /* that do not have any values */
      continue;
    if (mode & NBC_REMOVE)      /* if to remove attributes, */
      dvec->mark = i;           /* mark all selectable attributes */
    sa->attid = i;              /* note selectable attributes and */
    sa->errs  = 0; sa++;        /* initialize the number of errors */
  }
  cnt = (int)(sa -savec);       /* compute the number of attributes */
  nbc->dvecs[nbc->clsid].mark = nbc->clsid;   /* and mark the class */

  /* --- select attributes --- */
  while ((cnt  > 0)             /* while there are selectable atts. */
  &&     (errs > 0)) {          /* and the classifier is not perfect */
    for (sa = savec +(i = cnt); --i >= 0; )
      (--sa)->errs = 0;         /* clear the numbers of errors */
    r = _eval(nbc, table, mode, savec, cnt);
    if (r < 0) break;           /* evaluate selectable attributes */
    best = sa = savec;          /* traverse the selectable attributes */
    for (i = cnt; --i > 0; ) {  /* in order to find the best */
      if (((++sa)->errs <  best->errs)
      ||  ((  sa ->errs <= best->errs) && (mode & NBC_ADD)))
        best = sa;              /* find least number of errors and */
    }                           /* note the corresponding attribute */
    if ( (best->errs >  errs)   /* if more tuples were misclassified */
    ||  ((best->errs >= errs) && (mode & NBC_ADD)))
      break;                    /* abort the selection loop */
    errs = best->errs;          /* note the new number of errors */
    #ifndef NDEBUG
    fprintf(stderr, "%8g errors %s\n", best->errs,
            att_name(as_att(attset, best->attid)));
    #endif                      /* print a counter for debugging */
    nbc->dvecs[best->attid].mark = (mode & NBC_ADD)
      ? best->attid : -1;       /* mark/unmark the selected attribute */
    for (--cnt; best < sa; best++)  /* remove the selected */
      best->attid = best[1].attid;  /* attribute from the  */
  }                                 /* list of attributes  */
  free(savec);                  /* delete the selectable atts. vector */

  if (r < 0) {                  /* if an error occurred, abort */
    nbc_delete(nbc, mode & NBC_DUPAS); return NULL; }
  return nbc;                   /* return the created classifier */
}  /* nbc_induce() */

/*--------------------------------------------------------------------*/

int nbc_mark (NBC *nbc)
{                               /* --- mark selected attributes */
  int  i, m;                    /* loop variable, buffer for marker */
  DVEC *dvec;                   /* to traverse the distrib. vectors */
  int  cnt = 0;                 /* attribute counter */

  assert(nbc);                  /* check the function argument */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->mark < 0) m = -1;
    else           { cnt++; m =  1; }
    att_setmark(as_att(nbc->attset, i), m);
  }                             /* transfer marker to attribute set */
  att_setmark(as_att(nbc->attset, nbc->clsid), 0);
  return cnt;                   /* return number of marked atts. */
}  /* nbc_mark() */

#endif
/*--------------------------------------------------------------------*/

void nbc_setup (NBC *nbc, int mode, double lcorr)
{                               /* --- set up a naive Bayes class. */
  int    i, k, n;               /* loop variables */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  NORMD  *normd;                /* to traverse the normal   distribs. */
  DISCD  *discd;                /* to traverse the discrete distribs. */
  double *frq, *prb;            /* to traverse the value frqs./probs. */
  double cnt, sp;               /* number of cases, sum of priors */
  double add;                   /* Laplace corr. + distributed weight */

  assert(nbc && (lcorr >= 0));  /* check the function arguments */
  nbc->mode  = mode & (NBC_DISTUV|NBC_MAXLLH);
  nbc->lcorr = lcorr;           /* note estimation parameters */

  /* --- estimate class probabilities --- */
  cnt = nbc->total +lcorr *nbc->clscnt;
  n   = nbc->clscnt;            /* get the number of classes and */
  prb = nbc->priors +n;         /* traverse the class probabilities */
  if (cnt <= 0)                 /* if the denominator is invalid, */
    while (--n >= 0) *--prb = 0;       /* clear all probabilities */
  else {                        /* if the denominator is valid, */
    frq = nbc->frqs +n;         /* traverse the class frequencies */
    while (--n >= 0) *--prb = (*--frq +lcorr) /cnt;
  }                             /* estimate the class probabilities */

  /* --- estimate conditional probabilities --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if ((--dvec)->type == 0) {  /* traverse all attributes */
      dvec->mark = 0; continue;}/* except the class attribute */
    if      (mode & NBC_ALL)    /* if to use all attributes, */
      dvec->mark = 1;           /* mark attribute as used */
    else if (mode & NBC_MARKED) /* if to use only marked atts. */
      dvec->mark = (att_getmark(as_att(nbc->attset, i)) >= 0)
                 ? 1 : -1;      /* get the attribute mark */
    if (dvec->mark < 0)         /* otherwise keep the */
      continue;                 /* selection of attributes */
    if (dvec->type == AT_SYM) { /* -- if the attribute is symbolic */
      if (dvec->valcnt <= 0) {  /* if the attribute has no values, */
        dvec->mark = -1; continue; }     /* there is nothing to do */
      sp = dvec->valcnt *lcorr; /* compute the sum of the priors */
      for (discd = dvec->discds +(k = nbc->clscnt); --k >= 0; ) {
        --discd;                /* traverse the distributions */
        n   = dvec->valcnt;     /* get the number of att. values */
        prb = discd->probs +n;  /* and the probability vector */
        add = (mode & NBC_DISTUV) ? nbc->frqs[k] : discd->cnt;
        cnt = sp +add;          /* compute denominator of estimator */
        if (cnt <= 0)           /* if the estimator is invalid, */
          while (--n >= 0) *--prb = 0;      /* clear all probs. */
        else {                  /* if the estimator is valid */
          add = lcorr +(add -discd->cnt) /n;
          for (frq = discd->frqs +n; --n >= 0; )
            *--prb = (*--frq +add) /cnt;
        }                       /* traverse the value frequencies */
      } }                       /* and estimate the probabilities */
    else {                      /* -- if the attribute is numeric */
      for (normd = dvec->normds +(k = nbc->clscnt); --k >= 0; ) {
        cnt = (--normd)->cnt;   /* traverse the distributions */
        normd->exp = (cnt > 0) ? normd->sv /cnt : 0;
        if (!(mode & NBC_MAXLLH)) cnt -= 1;
        normd->var = (cnt > 0)
                   ? (normd->sv2 -normd->exp *normd->sv) /cnt : 0;
      }                         /* estimate the expected value and */
    }                           /* the variance (either with unbiased */
  }                             /* or with max. likelihood estimator) */
  nbc->dvecs[nbc->clsid].mark = 1;  /* mark the class attribute */
}  /* nbc_setup() */

/*----------------------------------------------------------------------
Point estimator for the expected value of a normal distribution:
  \hat{\mu} = \frac{1}{n} \sum_{i=1}^n x_i
(unbiased, consistent, and efficient)

Point estimator for the variance of a normal distribution:
  \hat{\sigma}^2 = \frac{1}{n-1} \sum_{i=1}^n (x_i - \hat{\mu})^2
                 = \frac{1}{n-1} (\sum_{i=1}^n x_i^2 - n\hat{\mu}^2)
(unbiased and consistent)

Maximum likelihood estimator for the variance of a normal distribution:
  \hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \hat{\mu})^2
                 = \frac{1}{n} (\sum_{i=1}^n x_i^2 - n\hat{\mu}^2)
(consistent)

Source: R.L. Larsen and M.L. Marx.
        An Introduction to Mathematical Statistics and Its Applications.
        Prentice Hall, Englewood Cliffs, NJ, USA 1986, pp. 312 & 314
----------------------------------------------------------------------*/

int nbc_exec (NBC *nbc, const TUPLE *tpl, double *conf)
{                               /* --- execute a naive Bayes class. */
  int        i, k;              /* loop variables */
  DVEC       *dvec;             /* to traverse the distrib. vectors */
  const INST *inst;             /* to traverse the instances */
  double     *s, *d;            /* to traverse the probabilities */
  double     sum;               /* sum of class probabilities */

  assert(nbc);                  /* check the function argument */

  /* --- initialize --- */
  s = nbc->priors +nbc->clscnt; /* init. the posterior distribution */
  d = nbc->posts  +nbc->clscnt; /* with  the prior     distribution */
  for (k = nbc->clscnt; --k >= 0; ) *--d = *--s;

  /* --- process attribute values --- */
  for (dvec = nbc->dvecs +(i = nbc->attcnt); --i >= 0; ) {
    if (((--dvec)->type == 0)   /* traverse all attributes */
    ||  (   dvec ->mark <  0))  /* except the class attribute */
      continue;                 /* and all unmarked attributes */
    inst = (tpl)                /* get the attribute instantiation */
         ? tpl_colval(tpl, i)   /* from the tuple or the att. set */
         : att_inst(as_att(nbc->attset, i));
    if (_exec(nbc, i, inst) < 0)/* execute the classifier */
      continue;                 /* for the current attribute */
    s = nbc->cond +nbc->clscnt; /* traverse the cond. probabilities */
    d = nbc->posts+nbc->clscnt; /* and the posterior distribution */
    for (sum = 0, k = nbc->clscnt; --k >= 0; )
      sum += *--d *= *--s;      /* multiply with cond. probabilities */
    if ((sum > 1e-24) && (sum < 1e24))
      continue;                 /* if the sum is ok, continue */
    if (sum <= 0) break;        /* if the sum is fubar, abort */
    for (d += k = nbc->clscnt; --k >= 0; )
      *--d /= sum;              /* otherwise renormalize in order */
  }                             /* to avoid an over- or underflow */
  s = d = nbc->posts;           /* traverse the final distribution */
  for (sum = *s, k = nbc->clscnt; --k > 0; ) {
    if (*++s > *d) d = s;       /* find the most probable class */
    sum += *s;                  /* and sum all probabilities */
  }                             /* (for the later normalization) */
  if (conf) *conf = (sum > 0) ? *d /sum : 0;
  return (int)(d -nbc->posts);  /* compute a confidence value and */
}  /* nbc_exec() */             /* return the classification result */

/*--------------------------------------------------------------------*/

void nbc_rand (NBC *nbc, double drand (void))
{                               /* --- generate a random tuple */
  int    i, k, n;               /* loop variables */
  double t, sum;                /* random number, sum of probs. */
  double *p;                    /* to access the probabilities */
  DVEC   *dvec;                 /* to traverse the distrib. vectors */
  INST   *inst;                 /* to traverse the instances */
  NORMD  *nd;                   /* normal distribution */

  p = nbc->priors;              /* get the prior probabilities */
  t = drand();                  /* generate a random number */
  for (sum = i = 0; i < nbc->clscnt; i++) {
    sum += p[i]; if (sum >= t) break; }
  if (i >= nbc->clscnt)         /* find the class that corresponds */
    i = nbc->clscnt -1;         /* to the generated random number */
  att_inst(as_att(nbc->attset, nbc->clsid))->i = i;
  for (dvec = nbc->dvecs +(n = nbc->attcnt); --n >= 0; ) {
    if (((--dvec)->type == 0)   /* traverse all attributes */
    ||  (   dvec ->mark <  0))  /* except the class attribute */
      continue;                 /* and all unmarked attributes */
    inst = att_inst(as_att(nbc->attset, n));
    if (dvec->type == AT_SYM) { /* --- if the attribute is symbolic */
      p = dvec->discds[i].probs;/* get the conditional distribution */
      t = drand();              /* generate a random number */
      for (sum = k = 0; k < dvec->valcnt; k++) {
        sum += p[k]; if (sum >= t) break; }
      if (k >= dvec->valcnt)    /* find the value that corresponds */
        k = dvec->valcnt -1;    /* to the generated random number */
      inst->i = k; }            /* and set the attribute instance */
    else {                      /* --- if the attribute is numeric */
      nd = dvec->normds +i;     /* get the conditional distribution */
      t  = sqrt(nd->var) *_normd(drand) +nd->exp;
      if (dvec->type == AT_FLT) inst->f = (float)t;
      else                      inst->i = (int)(t +0.5);
    }                           /* sample from the normal distrib. */
  }                             /* and transform the result */
}  /* nbc_rand() */

/*--------------------------------------------------------------------*/

int nbc_desc (NBC *nbc, FILE *file, int mode, int maxlen)
{                               /* --- describe a naive Bayes class. */
  int         i, k, n;          /* loop variables */
  int         pos, ind;         /* current position and indentation */
  int         len, l;           /* length of class/value name/number */
  const char  *clsname;         /* name of class attribute */
  ATT         *att, *clsatt;    /* to traverse the attributes */
  const DVEC  *dvec;            /* to traverse the distrib. vectors */
  const NORMD *normd;           /* to traverse the normal   distribs. */
  const DISCD *discd;           /* to traverse the discrete distribs. */
  char  name[4*AS_MAXLEN+4];    /* output buffer for names */
  char  num[64];                /* output buffer for numbers */

  assert(nbc && file);          /* check the function arguments */

  /* --- print a header (as a comment) --- */
  if (mode & NBC_TITLE) {       /* if the title flag is set */
    i = k = (maxlen > 0) ? maxlen -2 : 70;
    fputs("/*", file); while (--i >= 0) fputc('-', file);
    fputs("\n  naive Bayes classifier\n", file);
    while (--k >= 0) fputc('-', file); fputs("*/\n", file);
  }                             /* print a title header */
  if (maxlen <= 0) maxlen = INT_MAX;

  /* --- start description --- */
  clsatt  = as_att(nbc->attset, nbc->clsid);
  clsname = att_name(clsatt);   /* note the class attribute name */
  fputs("nbc(", file);          /* (is needed repeatedly below) */
  sc_format(name, clsname, 0);  /* format and print */
  fputs(name, file);            /* the class attribute name */
  fputs(") = {\n", file);       /* and start the description */
  if ((nbc->lcorr > 0)          /* if estimation parameters */
  ||   nbc->mode) {             /* differ from default values */
    fprintf(file, "  params = %g", nbc->lcorr);
    if (nbc->mode & NBC_DISTUV) fputs(", distuv", file);
    if (nbc->mode & NBC_MAXLLH) fputs(", maxllh", file);
    fputs(";\n", file);         /* print Laplace correction */
  }                             /* and estimation mode */

  /* --- print the class distribution --- */
  fputs("  prob(", file);       /* print a distribution indicator */
  fputs(name, file);            /* and the class attribute name and */
  fputs(") = {\n    ", file);   /* start the class distribution */
  pos = 4;                      /* initialize the output position */
  ind = att_valwd(clsatt,0) +2; /* compute position and indentation */
  for (i = 0; i < nbc->clscnt; i++) {   /* traverse the classes */
    if (i > 0)                  /* if this is not the first class, */
      fputs(",\n    ", file);   /* start a new output line */
    len = sc_format(name, att_valname(clsatt, i), 0);
    fputs(name, file);          /* get and print the class name */
    for (pos = len+2; pos < ind; pos++)
      putc(' ', file);          /* pad with blanks to equal width */
    fprintf(file, ": %g", nbc->frqs[i]);
    if (mode & NBC_REL)         /* print the absolute class frequency */
      fprintf(file, " (%.1f%%)", nbc->priors[i] *100);
  }                             /* print the relative class frequency */
  fputs(" };\n", file);         /* terminate the class distribution */

  /* --- print the (conditional) distributions --- */
  for (dvec = nbc->dvecs, n = 0; n < nbc->attcnt; dvec++, n++) {
    if ((dvec->type == 0)       /* traverse all attributes, */
    ||  ((mode & NBC_MARKED)    /* but skip the class attribute */
    &&   (dvec->mark < 0)))     /* and in marked mode also */
      continue;                 /* all unmarked attributes */
    fputs("  prob(", file);     /* print a distribution indicator */
    att = as_att(nbc->attset,n);/* and get the next attribute */
    sc_format(name, att_name(att), 0);
    fputs(name, file);          /* print the attribute name */
    putc('|', file);            /* and the condition separator */
    sc_format(name, clsname,0); /* format and print */
    fputs(name, file);          /* the class attribute name */
    fputs(") = {\n    ", file); /* and start the cond. distribution */
    if (dvec->discds) {         /* if the attribute is symbolic, */
      discd = dvec->discds;     /* traverse the discrete distribs. */
      for (i = 0; i < nbc->clscnt; discd++, i++) {
        if (i > 0)              /* if this is not the first class, */
          fputs(",\n    ", file);       /* start a new output line */
        len = sc_format(name, att_valname(clsatt, i), 0);
        fputs(name, file);      /* get and print the class name */
        for (pos = len+2; pos < ind; pos++)
          putc(' ', file);      /* pad with blanks to equal width */
        fputs(":{", file);      /* start the value distribution and */
        pos += 3;               /* traverse the attribute values */
        for (k = 0; k < dvec->valcnt; k++) {
          if (k > 0) {          /* if this is not the first value, */
            putc(',', file); pos++; }         /* print a separator */
          len  = sc_format(name, att_valname(att, k), 0);
          len += l = sprintf(num, ": %g", discd->frqs[k]);
          if (mode & NBC_REL)   /* format value frequency */
            len += sprintf(num +l, " (%.1f%%)", discd->probs[k]*100);
          if ((pos      > ind)  /* if the line would get too long */
          &&  (pos +len > maxlen -4)) {
            putc('\n', file);   /* start a new line and indent */
            for (pos = 0; pos < ind; pos++) putc(' ', file); }
          else {                /* if there is enough space left, */
            putc(' ', file); pos++; }   /* only print a separator */
          fputs(name, file); fputs(num, file);
          pos += len;           /* print value and its frequency */
        }                       /* and update the output position */
        fputs(" }", file);      /* terminate the value distribution */
      } }
    else {                      /* if the attribute is numeric, */
      normd = dvec->normds;     /* traverse the normal distributions */
      for (i = 0; i < nbc->clscnt; normd++, i++) {
        if (i > 0)              /* if this is not the first class, */
          fputs(",\n    ", file);       /* start a new output line */
        len = sc_format(name, att_valname(clsatt, i), 0);
        fputs(name, file);      /* get and print the class name */
        for (pos = len+2; pos < ind; pos++)
          putc(' ', file);      /* pad with blanks to equal width */
        fprintf(file, ": N(%g, %g) [%g]",
                normd->exp, normd->var, normd->cnt);
      }                         /* print the normal distribution */
      putc(' ', file);          /* with expected value and variance */
    }  /* if (dvec->discds) .. else .. */
    fputs("};\n", file);        /* terminate the distributions */
  }  /* for (n = 0; .. */
  fputs("};\n", file);          /* terminate the classifier */

  return ferror(file) ? -1 : 0; /* return the write status */
}  /* nbc_desc() */

/*--------------------------------------------------------------------*/
#ifdef NBC_PARSE

static int _distin (SCAN *scan, ATT *att, double *frqs, double *sum)
{                               /* --- read a distribution */
  int    i, cnt;                /* loop variable, number of values */
  double *p, f;                 /* to traverse the frequencies */
  int    t;                     /* buffer for token */

  assert(scan && att && frqs && sum); /* check the function arguments */
  GET_CHR('{');                 /* consume '{' (start of distrib.) */
  cnt = att_valcnt(att);        /* get the number of att. values */
  for (p = frqs +(i = cnt); --i >= 0; )
    *--p = -1;                  /* clear the value frequencies */
  while (1) {                   /* attribute value read loop */
    t = sc_token(scan);         /* check for a name */
    if ((t != T_ID) && (t != T_NUM)) ERROR(E_VALEXP);
    if (t != T_NUM) t = ':';    /* if the token is no number, */
    else {                      /* the token must be an att. value, */
      GET_TOK();                /* otherwise consume the token, */
      t = sc_token(scan);       /* note the next token, and */
      sc_back(scan);            /* go back to the previous one */
    }                           /* (look ahead one token) */
    if (t != ':')               /* if no ':' follows, */
      i = (i+1) % cnt;          /* get the cyclic successor id */
    else {                      /* if a  ':' follows */
      i = att_valid(att, sc_value(scan));
      if (i < 0) ERROR(E_UNKVAL);
      GET_TOK();                /* get and consume the value */
      GET_CHR(':');             /* consume ':' */
    }
    if (frqs[i] >= 0)           /* check whether value has been read */
      XERROR(E_DUPVAL, att_valname(att, i));
    if (sc_token(scan) != T_NUM) ERROR(E_NUMEXP);
    f = atof(sc_value(scan));   /* get and check */
    if (f < 0) ERROR(E_ILLNUM); /* the value frequency */
    frqs[i] = f;                /* set the value frequency */
    GET_TOK();                  /* consume the value frequency */
    if (sc_token(scan) == '('){ /* if a relative number follows, */
      GET_TOK();                /* consume '(' */
      if (sc_token(scan) != T_NUM)  ERROR(E_NUMEXP);
      if (atof(sc_value(scan)) < 0) ERROR(E_ILLNUM);
      GET_TOK();                /* consume the relative number */
      GET_CHR('%');             /* consume '%' */
      GET_CHR(')');             /* consume ')' */
    }
    if (sc_token(scan) != ',') break;
    GET_TOK();                  /* if at end of list, abort loop, */
  }                             /* otherwise consume ',' */
  GET_CHR('}');                 /* consume '}' (end of distribution) */
  for (f = 0, p = frqs +(i = cnt); --i >= 0; ) {
    if (*--p < 0) *p = 0;       /* clear the unset frequencies */
    else          f += *p;      /* and sum all other frequencies */
  }                             /* to obtain the total frequency */
  *sum = f;                     /* set the sum of the frequencies */
  return 0;                     /* return 'ok' */
}  /* _distin() */

/*--------------------------------------------------------------------*/

static int _discdin (NBC *nbc, SCAN *scan,
                     ATT *clsatt, ATT *att, DVEC *dvec)
{                               /* --- read discrete distributions */
  int   i = -1, t;              /* class identifier, buffer */
  DISCD *discd;                 /* to access discrete distribution */

  assert(nbc && clsatt && att && dvec); /* check function arguments */
  for (discd = dvec->discds +(i = nbc->clscnt); --i >= 0; )
    (--discd)->cnt = -1;        /* unmark all distributions */
  while (1) {                   /* distribution read loop */
    if (sc_token(scan) == '{')  /* if no class name is given, */
      i = (i+1) % nbc->clscnt;  /* get the cyclic successor */
    else {                      /* if a class name is given, */
      t = sc_token(scan);       /* check for a name */
      if ((t != T_ID) && (t != T_NUM)) ERROR(E_VALEXP);
      i = att_valid(clsatt, sc_value(scan));
      if (i < 0) ERROR(E_UNKVAL);
      GET_TOK();                /* get and consume the value */
      GET_CHR(':');             /* consume ':' */
    }
    discd = dvec->discds +i;    /* get and check the distribution */
    if (discd->cnt >= 0) XERROR(E_DUPVAL, att_valname(clsatt, i));
    discd->cnt = 0;             /* clear the counter as a flag */
    t = _distin(scan, att, discd->frqs, &discd->cnt);
    if (t) return t;            /* read distribution */
    if (sc_token(scan) != ',') break;
    GET_TOK();                  /* if at end of list, abort loop */
  }                             /* otherwise consume ',' */
  for (discd = dvec->discds +(i = nbc->clscnt); --i >= 0; )
    if ((--discd)->cnt < 0) discd->cnt = 0;
                                /* clear the unset counters */
  return 0;                     /* return 'ok' */
}  /* _discdin() */

/*--------------------------------------------------------------------*/

static int _contdin (NBC *nbc, SCAN *scan,
                     ATT *clsatt, ATT *att, DVEC *dvec)
{                               /* --- read continuous distributions */
  int    i = -1;                /* class identifier, buffer */
  NORMD  *normd;                /* to access normal distribution */
  double t;                     /* temporary buffer */

  assert(nbc && clsatt && att && dvec); /* check function arguments */
  for (normd = dvec->normds +(i = nbc->clscnt); --i >= 0; )
    (--normd)->cnt = -1;        /* unmark all distributions */
  while (1) {                   /* distribution read loop */
    t = sc_token(scan);         /* check for a name */
    if ((t != T_ID) && (t != T_NUM)) ERROR(E_VALEXP);
    if (t == T_NUM) t = ':';    /* if the token is a number, */
    else {                      /* the token must be a class */
      GET_TOK();                /* otherwise consume the token, */
      t = sc_token(scan);       /* note the next token, and */
      sc_back(scan);            /* go back to the previous one */
    }                           /* (look ahead one token) */
    if (t != ':')               /* if no class name is given, */
      i = (i+1) % nbc->clscnt;  /* get the cyclic successor id */
    else {                      /* if a  class name is given */
      i = att_valid(clsatt, sc_value(scan));
      if (i < 0) ERROR(E_UNKVAL);
      GET_TOK();                /* get and consume the class */
      GET_CHR(':');             /* consume ':' */
    }
    normd = dvec->normds +i;    /* get the normal distribution and */
    if (normd->cnt >= 0)        /* check whether it is already set */
      XERROR(E_DUPVAL, att_valname(clsatt, i));
    normd->cnt = 0;             /* clear the counter as a flag */
    if ((sc_token(scan) != T_ID)
    ||  (strcmp(sc_value(scan), "N") != 0))
      ERR_STR("N");             /* check for an 'N' */
    GET_TOK();                  /* consume 'N' */
    GET_CHR('(');               /* consume '(' */
    if (sc_token(scan) != T_NUM) ERROR(E_NUMEXP);
    normd->exp = atof(sc_value(scan));
    GET_TOK();                  /* get and consume the exp. value */
    GET_CHR(',');               /* consume ',' */
    if (sc_token(scan) != T_NUM) ERROR(E_NUMEXP);
    normd->var = atof(sc_value(scan));
    if (normd->var < 0)          ERROR(E_ILLNUM);
    GET_TOK();                  /* get and consume the variance */
    GET_CHR(')');               /* consume ')' */
    if (sc_token(scan) != '['){ /* if no number of cases follows, */
      normd->cnt = nbc->frqs[i];/* get the class frequencies */
      if (normd->cnt <= 1) normd->cnt = 2; }
    else {                      /* if a number of cases follows, */
      GET_TOK();                /* consume '[' and */
      if (sc_token(scan) != T_NUM) ERROR(E_NUMEXP);
      normd->cnt = atof(sc_value(scan));
      if (normd->cnt < 0)          ERROR(E_ILLNUM);
      GET_TOK();                /* consume the number of cases */
      GET_CHR(']');             /* consume ']' */
    }                           /* then compute the sums */
    normd->sv  = normd->exp *(t = normd->cnt);
    if (!(nbc->mode & NBC_MAXLLH)) t -= 1;
    normd->sv2 = normd->var *t +normd->exp *normd->sv;
    if (sc_token(scan) != ',') break;
    GET_TOK();                  /* if at end of list, abort loop, */
  }                             /* otherwise consume ',' */
  for (normd = dvec->normds +(i = nbc->clscnt); --i >= 0; )
    if ((--normd)->cnt < 0) normd->cnt = 0;
                                /* clear the unset counters */
  return 0;                     /* return 'ok' */
}  /* _contdin() */

/*--------------------------------------------------------------------*/

static int _dvecsin (NBC *nbc, SCAN *scan, ATT *clsatt)
{                               /* --- read distribution vectors */
  int  t;                       /* temporary buffer */
  int  attid;                   /* attribute identifier */
  ATT  *att;                    /* current attribute */
  DVEC *dvec;                   /* to traverse the distrib. vectors */

  assert(nbc && scan && clsatt);   /* check the function arguments */
  while ((sc_token(scan) == T_ID)  /* while another dist. follows */
  &&     ((strcmp(sc_value(scan), "prob") == 0)
  ||      (strcmp(sc_value(scan), "P")    == 0))) {
    GET_TOK();                  /* consume 'prob' or 'P' */
    GET_CHR('(');               /* consume '(' */
    t = sc_token(scan);         /* check for a name */
    if ((t != T_ID) && (t != T_NUM)) ERROR(E_ATTEXP);
    attid = as_attid(nbc->attset, sc_value(scan));
    if (attid < 0)                   ERROR(E_UNKATT);
    att  = as_att(nbc->attset, attid);
    dvec = nbc->dvecs +attid;   /* get and check the attribute */
    if (dvec->type == 0) ERROR(E_ATTYPE);
    if (dvec->mark >= 0) ERROR(E_DUPATT);
    dvec->mark = 1;             /* set the read flag */
    GET_TOK();                  /* consume the attribute name */
    GET_CHR('|');               /* consume '|' (condition indicator) */
    t = sc_token(scan);         /* get the next token */
    if (((t != T_ID) && (t != T_NUM))
    ||  (strcmp(sc_value(scan), att_name(clsatt)) != 0))
      ERROR(E_CLSEXP);          /* check for the class att. name */
    GET_TOK();                  /* consume the class att. name */
    GET_CHR(')');               /* consume ')' */
    GET_CHR('=');               /* consume '=' */
    GET_CHR('{');               /* consume '{' */
    if (sc_token(scan) != '}'){ /* if a distribution vector follows */
      t = (dvec->type == AT_SYM)
        ? _discdin(nbc, scan, clsatt, att, dvec)
        : _contdin(nbc, scan, clsatt, att, dvec);
      if (t) return t;          /* read conditional distributions */
    }                           /* and check for an error */
    GET_CHR('}');               /* consume '}' */
    GET_CHR(';');               /* consume ';' */
  }  /* while ((sc_token == T_ID) .. */

  return 0;                     /* return 'ok' */
}  /* _dvecsin() */

/*--------------------------------------------------------------------*/

static int _parse (ATTSET *attset, SCAN *scan, NBC **pnbc)
{                               /* --- parse a naive Bayes classifier */
  int  i, t;                    /* loop variable, buffer */
  int  err = 0;                 /* error flag */
  int  clsid;                   /* class attribute index */
  ATT  *att;                    /* class attribute */
  NBC  *nbc;                    /* created naive Bayes classifier */
  DVEC *dvec;                   /* to traverse the distrib. vectors */

  /* --- read header --- */
  if ((sc_token(scan) != T_ID)
  ||  (strcmp(sc_value(scan), "nbc") != 0))
    ERR_STR("nbc");             /* check for 'nbc' */
  GET_TOK();                    /* consume 'nbc' */
  GET_CHR('(');                 /* consume '(' */
  t = sc_token(scan);           /* check for a name */
  if ((t != T_ID) && (t != T_NUM)) ERROR(E_ATTEXP);
  clsid = as_attid(attset, sc_value(scan));
  if (clsid < 0)                   ERROR(E_UNKATT);
  att = as_att(attset, clsid);  /* get and check the class attribute */
  if (att_type(att) != AT_SYM)     ERROR(E_CLSTYPE);
  if (att_valcnt(att) < 1)         ERROR(E_CLSCNT);
  *pnbc = nbc = nbc_create(attset, clsid);
  if (!nbc) ERROR(E_NOMEM);     /* create a naive Bayes classifier */
  GET_TOK();                    /* consume the class name */
  GET_CHR(')');                 /* consume '(' */
  GET_CHR('=');                 /* consume '=' */
  GET_CHR('{');                 /* consume '{' */

  /* --- read parameters --- */
  if ((sc_token(scan) == T_ID)  /* if 'params' follows */
  &&  (strcmp(sc_value(scan), "params") == 0)) {
    GET_TOK();                  /* consume 'params' */
    GET_CHR('=');               /* consume '=' */
    if (sc_token(scan) != T_NUM)  ERROR(E_NUMEXP);
    nbc->lcorr = atof(sc_value(scan));
    if (nbc->lcorr < 0)           ERROR(E_ILLNUM);
    GET_TOK();                  /* get Laplace correction */
    while (sc_token(scan) == ',') {
      GET_TOK();                /* read list of parameters */
      if (sc_token(scan) != T_ID) ERROR(E_PAREXP);
      if      (strcmp(sc_value(scan), "distuv") == 0)
        nbc->mode |= NBC_DISTUV;/* distribute weight for unknowns */
      else if (strcmp(sc_value(scan), "maxllh") == 0)
        nbc->mode |= NBC_MAXLLH;/* use max. likelihood estimate */
      else ERROR(E_PAREXP);     /* abort on all other values */
      GET_TOK();                /* consume the estimator flag */
    }
    GET_CHR(';');               /* consume ';' */
  }

  /* --- read class distribution --- */
  if ((sc_token(scan) != T_ID)
  ||  ((strcmp(sc_value(scan), "prob") != 0)
  &&   (strcmp(sc_value(scan), "P")    != 0)))
    ERR_STR("prob");            /* check for 'prob' or 'P' */
  GET_TOK();                    /* consume   'prob' or 'P' */
  GET_CHR('(');                 /* consume '(' */
  t = sc_token(scan);           /* get the next token */
  if (((t != T_ID) && (t != T_NUM))
  ||  (strcmp(sc_value(scan), att_name(att)) != 0))
    ERROR(E_CLSEXP);            /* check for a class name */
  GET_TOK();                    /* consume the class name */
  GET_CHR(')');                 /* consume ')' */
  GET_CHR('=');                 /* consume '=' */
  t = _distin(scan, att, nbc->frqs, &nbc->total);
  if (t) return t;              /* read the class distribution */
  GET_CHR(';');                 /* consume ';' */
  nbc->dvecs[clsid].mark = 0;   /* mark the class attribute */

  /* --- read conditional distributions --- */
  do {                          /* read the distributions, */
    t = _dvecsin(nbc,scan,att); /* trying to recover on errors */
    if (t) { err = t; sc_recover(scan, ';', 0, 0, 0); }
  } while (t);                  /* while not all dists. read */
  if (err) return err;          /* if an error occurred, abort */
  for (dvec = nbc->dvecs, i = 0; i < nbc->attcnt; dvec++, i++) {
    if ((dvec->mark < 0) && (nbc->frqs[i] > 0))
      XERROR(E_MISATT, att_name(as_att(attset, i)));
  }                             /* check for a complete classifier */

  GET_CHR('}');                 /* consume '}' */
  GET_CHR(';');                 /* consume ';' */
  return 0;                     /* return 'ok' */
}  /* _parse() */

/*--------------------------------------------------------------------*/

NBC* nbc_parse (ATTSET *attset, SCAN *scan)
{                               /* --- parse a naive Bayes class. */
  NBC *nbc = NULL;              /* created naive Bayes classifier */

  assert(attset && scan);       /* check the function arguments */
  pa_init(scan);                /* initialize parsing */
  if (_parse(attset, scan, &nbc) != 0) {
    if (nbc) nbc_delete(nbc,0); /* parse a naive Bayes classifier */
    return NULL;                /* if an error occurred, */
  }                             /* delete the classifier and abort */
  nbc_setup(nbc, nbc->mode, nbc->lcorr);
  return nbc;                   /* set up the created classifier */
}  /* nbc_parse() */            /* and then return it */

#endif