/* www.pudn.com > bayes.rar > bcdb.c */


/*----------------------------------------------------------------------
  File    : bcdb.c
  Contents: generate a database from a Bayes classifier
  Author  : Christian Borgelt
  History : 26.04.2003 file created from file bcx.c
            16.08.2003 slight changes in error message output
----------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <float.h>
#include <time.h>
#include <assert.h>
#ifndef SC_SCAN
#define SC_SCAN
#endif
#include "scan.h"
#ifndef AS_RDWR
#define AS_RDWR
#endif
#ifndef AS_PARSE
#define AS_PARSE
#endif
#include "attset.h"
#ifndef NBC_PARSE
#define NBC_PARSE
#endif
#include "nbayes.h"
#ifndef FBC_PARSE
#define FBC_PARSE
#endif
#include "fbayes.h"
#ifdef STORAGE
#include "storage.h"
#endif

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
/* short program name (the messages printed in this file use argv[0]) */
#define PRGNAME     "bcdb"
/* one-line description, printed in the startup and the usage message */
#define DESCRIPTION "generate a database from a Bayes classifier"
/* version and copyright string, printed after the description */
#define VERSION     "version 1.1 (2003.08.16)         " \
                    "(c) 2003   Christian Borgelt"

/* --- error codes --- */
/* All codes are <= 0. The function error() negates a code to index   */
/* the table errmsgs[] and passes the code unchanged to exit(), so    */
/* the numbering must stay in step with the order of that table.      */
#define OK            0         /* no error */
#define E_NONE        0         /* no error */
#define E_NOMEM     (-1)        /* not enough memory */
#define E_FOPEN     (-2)        /* cannot open file */
#define E_FREAD     (-3)        /* read error on file */
#define E_FWRITE    (-4)        /* write error on file */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* wrong number of arguments */
#define E_PARSE     (-8)        /* parse error */
#define E_NEGLC     (-9)        /* negative Laplace correction */
#define E_UNKNOWN  (-10)        /* unknown error (last table entry) */

/*----------------------------------------------------------------------
  Constants
----------------------------------------------------------------------*/
/* Message formats for error(); the slot index is the negated error   */
/* code. Some formats take one argument (a file name for %s, the      */
/* offending option character for %c).                                */
static const char *errmsgs[] = {
  [ 0] = "no error\n",                                /* E_NONE    */
  [ 1] = "not enough memory\n",                       /* E_NOMEM   */
  [ 2] = "cannot open file %s\n",                     /* E_FOPEN   */
  [ 3] = "read error on file %s\n",                   /* E_FREAD   */
  [ 4] = "write error on file %s\n",                  /* E_FWRITE  */
  [ 5] = "unknown option -%c\n",                      /* E_OPTION  */
  [ 6] = "missing option argument\n",                 /* E_OPTARG  */
  [ 7] = "wrong number of arguments\n",               /* E_ARGCNT  */
  [ 8] = "parse error(s) on file %s\n",               /* E_PARSE   */
  [ 9] = "Laplace correction must not be negative\n", /* E_NEGLC   */
  [10] = "unknown error\n"                            /* E_UNKNOWN */
};

/*----------------------------------------------------------------------
  Global Variables
----------------------------------------------------------------------*/
/* prgname is set from argv[0] at the start of main() and is printed  */
/* as a prefix by error(). It is not static, i.e., it has external    */
/* linkage. The remaining objects are file-local and are kept at file */
/* scope so that error() can release them before it terminates the    */
/* program (debug builds only).                                       */
const  char   *prgname = NULL;  /* program name for error messages */
static SCAN   *scan    = NULL;  /* scanner (only while parsing) */
static NBC    *nbc     = NULL;  /* naive Bayes classifier */
static FBC    *fbc     = NULL;  /* full  Bayes classifier */
static ATTSET *attset  = NULL;  /* attribute set */
static FILE   *out     = NULL;  /* output file (may be stdout) */

/*----------------------------------------------------------------------
  Random Number Functions
----------------------------------------------------------------------*/
#ifdef DRAND48                  /* drand48() family is available: */
extern double drand48 (void);   /* declare the two functions and */
extern void   srand48 (long seed);       /* map the generic names */
#define dseed(s) srand48((long)(s))
#define drand    drand48

#else                           /* otherwise fall back on rand() */
#define dseed(s) srand((unsigned)(s))
static double drand (void)
{                               /* --- random number in [0,1) */
  double r = (double)rand();    /* integer in [0, RAND_MAX] */
  return r / ((double)RAND_MAX +1.0);  /* divisor exceeds maximum */
}  /* drand() */

#endif
/*----------------------------------------------------------------------
  Functions
----------------------------------------------------------------------*/

/* Print the message belonging to an error code and terminate the     */
/* program with that code as its exit status. The variable arguments  */
/* are those required by the message format in errmsgs[]. Codes below */
/* E_UNKNOWN are mapped to E_UNKNOWN; for codes >= 0 no message is    */
/* printed. This function does not return.                            */
static void error (int code, ...)
{                               /* --- report error and terminate */
  va_list    ap;                /* variable argument cursor */
  const char *fmt;              /* format of the message to print */

  assert(prgname);              /* the program name must be set */
  if (code < E_UNKNOWN)         /* codes beyond the message table */
    code = E_UNKNOWN;           /* are reported as unknown errors */
  if (code < 0) {               /* if there is an error to report */
    fmt = errmsgs[-code];       /* look up the message format and */
    if (fmt == NULL)            /* substitute the generic message */
      fmt = errmsgs[-E_UNKNOWN];/* if the table entry is empty */
    fprintf(stderr, "\n%s: ", prgname);
    va_start(ap, code);         /* print the message with */
    vfprintf(stderr, fmt, ap);  /* the arguments passed */
    va_end(ap);                 /* by the caller */
  }
  #ifndef NDEBUG
  if (nbc)                      /* release the classifier, */
    nbc_delete(nbc, 0);
  if (fbc)
    fbc_delete(fbc, 0);
  if (attset)                   /* the attribute set, */
    as_delete(attset);
  if (scan)                     /* the scanner, */
    sc_delete(scan);
  if (out && (out != stdout))   /* and the output file */
    fclose(out);                /* (never close stdout) */
  #endif
  #ifdef STORAGE
  showmem("at end of program"); /* check memory usage */
  #endif
  exit(code);                   /* abort the program */
}  /* error() */

/*--------------------------------------------------------------------*/

/* Program entry point.                                               */
/* Without arguments a usage message is printed to stdout. Otherwise  */
/* the options and the two file names are evaluated, the attribute    */
/* set and the (naive or full) Bayes classifier are parsed from the   */
/* classifier file, the classifier is set up anew if -L, -v/V or -m/M */
/* was given, and then the requested number of random tuples is       */
/* generated and written to the output file. An empty output file     */
/* name (also produced by a lone "-") selects standard output.        */
/* Returns 0 on success; every failure goes through error(), which    */
/* prints a message and exits with a negative error code.             */
int main (int argc, char* argv[])
{                               /* --- main function */
  int    i, k = 0;              /* loop variable, non-option counter */
  char   *s;                    /* to traverse options */
  char   **optarg = NULL;       /* option argument */
  char   *fn_bc   = NULL;       /* name of classifier file */
  char   *fn_out  = NULL;       /* name of output file */
  char   *blank   = NULL;       /* blank */
  char   *fldsep  = NULL;       /* field  separator */
  char   *recsep  = NULL;       /* record separator */
  int    flags    = AS_ATT;     /* table file write flags */
  double lcorr    = -DBL_MAX;   /* Laplace correction value */
                                /* (-DBL_MAX means: not specified) */
  int    distuv   = 0;          /* distribute weight of unknowns */
  int    maxllh   = 0;          /* max. likelihood est. of variance */
  int    tplcnt   = 1000;       /* number of tuples to generate */
  long   seed;                  /* seed for random number generator */
  int    mode;                  /* classifier setup mode */

  prgname = argv[0];            /* get program name for error msgs. */
  seed    = (long)time(NULL);   /* and get a default seed value */

  /* --- print startup/usage message --- */
  if (argc > 1) {               /* if arguments are given */
    fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
    fprintf(stderr, VERSION); } /* print a startup message */
  else {                        /* if no argument given */
    printf("usage: %s [options] bcfile "
                     "[-d|-h hdrfile] tabfile\n", argv[0]);
    printf("%s\n", DESCRIPTION);
    printf("%s\n", VERSION);
    printf("-n#      number of tuples to generate "
                    "(default: %d)\n", tplcnt);
    printf("-s#      seed for random number generator "
                    "(default: time)\n");
    printf("-L#      Laplace correction "
                    "(default: as specified in classifier)\n");
    printf("-v/V     (do not) distribute tuple weight "
                    "for unknown values\n");
    printf("-m/M     (do not) use maximum likelihood estimate "
                    "for the variance\n");
    printf("-a       align fields (default: do not align)\n");
    printf("-w       do not write field names to the output file\n");
    printf("-b/f/r#  blank character, field and record separator\n"
           "         (default: \" \", \" \", \"\\n\")\n");
    printf("bcfile   file containing classifier description\n");
    printf("tabfile  table file to write\n");
    return 0;                   /* print a usage message */
  }                             /* and abort the program */

  /* --- evaluate arguments --- */
  for (i = 1; i < argc; i++) {  /* traverse arguments */
    s = argv[i];                /* get option argument */
    if (optarg) { *optarg = s; optarg = NULL; continue; }
    if ((*s == '-') && *++s) {  /* -- if argument is an option */
      while (*s) {              /* traverse options */
        switch (*s++) {         /* evaluate option */
          case 'n': tplcnt  = (int)strtol(s, &s, 0); break;
          case 's': seed    =      strtol(s, &s, 0); break;
          case 'L': lcorr   = strtod(s, &s);         break;
          case 'v': distuv  = NBC_ALL;               break;
          case 'V': distuv |= NBC_DISTUV|NBC_ALL;    break;
          case 'm': maxllh  = NBC_ALL;               break;
          case 'M': maxllh |= NBC_MAXLLH|NBC_ALL;    break;
          case 'a': flags  |= AS_ALIGN;              break;
          case 'w': flags  &= ~AS_ATT;               break;
          case 'b': optarg  = &blank;                break;
          case 'f': optarg  = &fldsep;               break;
          case 'r': optarg  = &recsep;               break;
          default : error(E_OPTION, *--s);           break;
        }                       /* set option variables */
        if (!*s) break;         /* if at end of string, abort loop */
        /* the rest of the current argument is the option argument; */
        /* otherwise the next argument is taken (start of the loop) */
        if (optarg) { *optarg = s; optarg = NULL; break; }
      } }                       /* get option argument */
    else {                      /* if argument is no option */
      switch (k++) {            /* evaluate non-option */
        case  0: fn_bc  = s;      break;
        case  1: fn_out = s;      break;
        default: error(E_ARGCNT); break;
      }                         /* note filenames */
    }
  }
  if (optarg) error(E_OPTARG);  /* check the option argument */
  if (k != 2) error(E_ARGCNT);  /* and the number of arguments */
  if ((lcorr < 0) && (lcorr > -DBL_MAX))
    error(E_NEGLC);             /* check the Laplace correction */
  if ((flags & AS_ATT) && (flags & AS_ALIGN))
    flags |= AS_ALNHDR;         /* set align to header flag */
  /* NOTE(review): blank, fldsep and recsep are stored above, but are */
  /* not read anywhere in the code visible here, so the options -b,   */
  /* -f and -r currently have no effect. Presumably they should be    */
  /* handed to the attribute set before writing -- confirm against    */
  /* the attribute set interface (attset.h).                          */

  /* --- read Bayes classifier --- */
  scan = sc_create(fn_bc);      /* create a scanner */
  if (!scan) error((!fn_bc || !*fn_bc) ? E_NOMEM : E_FOPEN, fn_bc);
  attset = as_create("domains", att_delete);
  if (!attset) error(E_NOMEM);  /* create an attribute set */
  fprintf(stderr, "\nreading %s ... ", sc_fname(scan));
  if ((sc_nexter(scan)   <  0)  /* start scanning (get first token) */
  ||  (as_parse(attset, scan, AT_ALL) != 0)
  ||  (as_attcnt(attset) <= 0)) /* parse attribute set */
    error(E_PARSE, sc_fname(scan));
  if ((sc_token(scan) == T_ID)  /* determine classifier type */
  &&  (strcmp(sc_value(scan), "fbc") == 0))
       fbc = fbc_parse(attset, scan);
  else nbc = nbc_parse(attset, scan);
  if ((!fbc && !nbc)            /* parse the Bayes classifier */
  ||   !sc_eof(scan))           /* and check for end of file */
    error(E_PARSE, sc_fname(scan));
  sc_delete(scan); scan = NULL; /* delete the scanner */
  fprintf(stderr, "[%d attribute(s)] done.\n", as_attcnt(attset));
  if ((lcorr >= 0) || distuv || maxllh) {
    if (lcorr < 0)              /* get the classifier's parameters */
      lcorr = (fbc) ? fbc_lcorr(fbc) : nbc_lcorr(nbc);
    mode    = (fbc) ? fbc_mode(fbc)  : nbc_mode(nbc);
    if (distuv) mode = (mode & ~NBC_DISTUV) | distuv;
    if (maxllh) mode = (mode & ~NBC_MAXLLH) | maxllh;
                                /* adapt the estimation parameters */
    if (fbc) fbc_setup(fbc, mode, lcorr);
    else     nbc_setup(nbc, mode, lcorr);
  }                             /* set up the classifier anew */

  /* --- generate database --- */
  if (fn_out && *fn_out)        /* if an output file name is given, */
    out = fopen(fn_out, "w");   /* open output file for writing */
  else {                        /* if no output file name is given, */
    out = stdout; fn_out = ""; }    /* write to std. output */
  fprintf(stderr, "writing %s ... ", fn_out);
  if (!out) error(E_FOPEN, fn_out);
  if ((flags & AS_ATT)          /* if to write a table header */
  &&  (as_write(attset, out, flags) != 0))
    error(E_FWRITE, fn_out);    /* write the attributes names */
  flags = AS_INST | (flags & ~AS_ATT);
  dseed(seed);                  /* init. random number generator */
  for (i = tplcnt; --i >= 0;) { /* generate random tuples */
    if (fbc) fbc_rand(fbc, drand);   /* instantiate the */
    else     nbc_rand(nbc, drand);   /* attribute set */
    if (as_write(attset, out, flags) != 0)
      error(E_FWRITE,fn_out);   /* write the generated tuple */
  }                             /* to the output file */
  if (out != stdout) {          /* if not written to stdout */
    i = fclose(out); out = NULL;/* close the output file */
    if (i != 0) error(E_FWRITE, fn_out);
  }
  else if (fflush(out) != 0)    /* stdout is not closed here, so */
    error(E_FWRITE, fn_out);    /* flush it to detect write errors */
                                /* print a success message */
  fprintf(stderr, "[%d tuple(s)] done.\n", tplcnt);

  /* --- clean up --- */
  #ifndef NDEBUG
  if (fbc) fbc_delete(fbc, 1);  /* delete full  Bayes classifier */
  if (nbc) nbc_delete(nbc, 1);  /* or     naive Bayes classifier */
  #endif                        /* and underlying attribute set */
  #ifdef STORAGE
  showmem("at end of program"); /* check memory usage */
  #endif
  return 0;                     /* return 'ok' */
}  /* main() */