/* www.pudn.com > bayes.rar > tbal.c */


/*----------------------------------------------------------------------
  File    : tbal.c
  Contents: program to balance value frequencies
  Author  : Christian Borgelt
  History : 13.02.1999 file created from file skel1.c
            17.04.1999 simplified using the new module 'io'
            14.07.2001 adapted to modified module tfscan
            16.08.2003 slight changes in error message output
----------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <assert.h>
#ifndef AS_RDWR
#define AS_RDWR
#endif
#ifndef TAB_RDWR
#define TAB_RDWR
#endif
#include "io.h"
#ifdef STORAGE
#include "storage.h"
#endif

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
/* program identification; DESCRIPTION and VERSION are printed in the */
/* startup message and in the usage message of main()                 */
#define PRGNAME     "tbal"
#define DESCRIPTION "balance value frequencies"
#define VERSION     "version 1.5 (2003.08.16)         " \
                    "(c) 1999-2003   Christian Borgelt"

/* --- error codes --- */
/* All codes are <= 0. error() uses the negated code as an index into */
/* errmsgs[], so this list and that table must be kept in the same    */
/* order; codes below E_UNKNOWN are mapped to E_UNKNOWN by error().   */
#define OK            0         /* no error */
#define E_NONE        0         /* no error */
#define E_NOMEM     (-1)        /* not enough memory */
#define E_FOPEN     (-2)        /* file open failed */
#define E_FREAD     (-3)        /* file read failed */
#define E_FWRITE    (-4)        /* file write failed */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* wrong number of arguments */
#define E_EXPVAL    (-8)        /* value expected */
#define E_EXPNUM    (-9)        /* number expected */
#define E_NUMBER   (-10)        /* illegal number */
#define E_UNKNOWN  (-11)        /* unknown error (must stay the last code) */

/* NOTE(review): TMFIELDS is not referenced in the visible part of    */
/* this file -- possibly a leftover; confirm before removing it       */
#define TMFIELDS   "too many fields"

/*----------------------------------------------------------------------
  Constants
----------------------------------------------------------------------*/
/* printf-style message formats used by error(); the entry for an    */
/* error code is found at the index given by the negated code        */
static const char *errmsgs[] = {
  "no error\n",                             /*   0: E_NONE    */
  "not enough memory\n",                    /*  -1: E_NOMEM   */
  "cannot open file %s\n",                  /*  -2: E_FOPEN   */
  "read error on file %s\n",                /*  -3: E_FREAD   */
  "write error on file %s\n",               /*  -4: E_FWRITE  */
  "unknown option -%c\n",                   /*  -5: E_OPTION  */
  "missing option argument\n",              /*  -6: E_OPTARG  */
  "wrong number of arguments\n",            /*  -7: E_ARGCNT  */
  "file %s, record %d: value expected\n",   /*  -8: E_EXPVAL  */
  "file %s, record %d: number expected\n",  /*  -9: E_EXPNUM  */
  "file %s, record %d: illegal number "     /* -10: E_NUMBER  */
    "\"%s\" in field %d\n",
  "unknown error\n"                         /* -11: E_UNKNOWN */
};

/*----------------------------------------------------------------------
  Global Variables
----------------------------------------------------------------------*/
/* program name printed in front of error messages (set in main) */
const char *prgname = NULL;

/* resources that error() releases (in debug builds) before exiting */
static ATTSET *attset = NULL;           /* attribute set */
static TABLE  *table  = NULL;           /* table */
static FILE   *in     = NULL;           /* currently open input file */
static double *freqs  = NULL;           /* value frequency vector */

/* buffer receiving one field at a time from the frequency file */
static char buf[AS_MAXLEN +1];

/*----------------------------------------------------------------------
  Functions
----------------------------------------------------------------------*/

/* Report an error (if any) and terminate the program.               */
/*   code  error code; a negative code selects a message format from */
/*         errmsgs[] (codes below E_UNKNOWN are treated as           */
/*         E_UNKNOWN); for a non-negative code nothing is printed    */
/*   ...   arguments consumed by the selected message format         */
/* Never returns: the process exits with 'code' as its exit status.  */
static void error (int code, ...)
{
  va_list    ap;                /* variable argument list */
  const char *fmt;              /* format of the message to print */

  assert(prgname);              /* the program name must be set */
  if (code < E_UNKNOWN)         /* map codes outside the table */
    code = E_UNKNOWN;           /* to the generic error */
  if (code < 0) {               /* only negative codes carry a message */
    fmt = errmsgs[-code];
    if (fmt == NULL)            /* fall back to the generic message */
      fmt = errmsgs[-E_UNKNOWN];
    fprintf(stderr, "\n%s: ", prgname);
    va_start(ap, code);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
  }
  #ifndef NDEBUG                /* debug builds release all resources */
  free(freqs);                  /* (free accepts a null pointer) */
  if (table)
    tab_delete(table, 0);
  if (attset)
    as_delete(attset);
  if ((in != NULL) && (in != stdin))
    fclose(in);                 /* never close standard input */
  #endif
  #ifdef STORAGE
  showmem("at end of program"); /* report memory usage */
  #endif
  exit(code);                   /* terminate with the error code */
}  /* error() */

/*--------------------------------------------------------------------*/

/*----------------------------------------------------------------------
  main: balance the value frequencies of one column of a table.
  Steps: evaluate the command line, read the table header and the table,
  optionally read value/relative frequency pairs from a frequency file
  (option -q), reduce and balance the table with respect to the class
  column and write the result to the output file.
  Returns 0 on success and after printing the usage message (program
  called without arguments). Every failure ends the program through
  error(), which exits with the corresponding error code.
----------------------------------------------------------------------*/
int main (int argc, char *argv[])
{                               /* --- main function */
  int    i, k = 0;              /* loop variables, counters */
  char   *s;                    /* to traverse options */
  char   **optarg = NULL;       /* option argument */
  char   *fn_hdr  = NULL;       /* name of table header file */
  char   *fn_tab  = NULL;       /* name of table file */
  char   *fn_frq  = NULL;       /* name of frequency file */
  char   *fn_out  = NULL;       /* name of output file */
  char   *blanks  = NULL;       /* blank  characters */
  char   *fldseps = NULL;       /* field  separators */
  char   *recseps = NULL;       /* record separators */
  char   *uvchars = NULL;       /* unknown value characters */
  char   *clscol  = NULL;       /* name of class column to balance */
  int    inflags  = 0;                 /* table file read  flags */
  int    outflags = AS_ATT|AS_WEIGHT;  /* table file write flags */
  double wgtsum   = 0;          /* weight of tuples in output table */
  int    valcnt;                /* number of attribute values */
  int    clsid;                 /* id of class column */
  ATT    *att;                  /* class attribute */
  int    d;                     /* delimiter type */

  prgname = argv[0];            /* get program name for error msgs. */

  /* --- print startup/usage message --- */
  if (argc > 1) {               /* if arguments are given */
    fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
    fprintf(stderr, VERSION); } /* print a startup message */
  else {                        /* if no argument is given */
    printf("usage: %s [options] [-q frqfile] "
                     "[-d|-h hdrfile] tabfile outfile\n", argv[0]);
    printf("%s\n", DESCRIPTION);
    printf("%s\n", VERSION);
    printf("-c#      name of field to balance (default: last field)\n");
    printf("-s#      sum of tuple weights in output table "
                    "(default: as in input table)\n");
    printf("         (-2: lower, -1: boost, 0: shift weights)\n");
    printf("-q       adjust to relative frequencies stated in frqfile\n");
    printf("frqfile  file containing value/relative frequency pairs\n");
    printf("-a       align fields of output table "
                    "(default: do not align)\n");
    printf("-w       do not write field names to output file\n");
    printf("-b/f/r#  blank characters, field and record separators\n"
           "         (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
    printf("-u#      unknown value characters (default: \"?\")\n");
    printf("-n       number of tuple occurrences in last field\n");
    printf("-d       use default header "
                    "(field names = field numbers)\n");
    printf("-h       read table header (field names) from hdrfile\n");
    printf("hdrfile  file containing table header (field names)\n");
    printf("tabfile  table file to read "
                    "(field names in first record)\n");
    printf("outfile  file to write output table to\n");
    return 0;                   /* print a usage message */
  }                             /* and abort the program */

  /* --- evaluate arguments --- */
  /* 'optarg' points to the variable that is to receive the argument */
  /* of the option seen last; the argument is either the rest of the */
  /* current command line word or, if that is empty, the next word.  */
  for (i = 1; i < argc; i++) {  /* traverse arguments */
    s = argv[i];                /* get option argument */
    if (optarg) { *optarg = s; optarg = NULL; continue; }
    if ((*s == '-') && *++s) {  /* -- if argument is an option */
      while (1) {               /* traverse characters */
        switch (*s++) {         /* evaluate option */
          case 'c': optarg    = &clscol;       break;
          case 's': wgtsum    = strtod(s, &s); break;
          case 'q': optarg    = &fn_frq;       break;
          case 'a': outflags |= AS_ALIGN;      break;
          case 'w': outflags &= ~AS_ATT;       break;
          case 'b': optarg    = &blanks;       break;
          case 'f': optarg    = &fldseps;      break;
          case 'r': optarg    = &recseps;      break;
          case 'u': optarg    = &uvchars;      break;
          case 'n': inflags  |= AS_WEIGHT;     break;
          case 'd': inflags  |= AS_DFLT;       break;
          case 'h': optarg    = &fn_hdr;       break;
          default : error(E_OPTION, *--s);     break;
        }                       /* set option variables */
        if (!*s) break;         /* if at end of string, abort loop */
        if (optarg) { *optarg = s; optarg = NULL; break; }
      } }                       /* get option argument */
    else {                      /* -- if argument is no option */
      switch (k++) {            /* evaluate non-option */
        case  0: fn_tab = s;      break;
        case  1: fn_out = s;      break;
        default: error(E_ARGCNT); break;
      }                         /* note filenames */
    }
  }
  if (optarg) error(E_OPTARG);  /* check option argument */
  if (k != 2) error(E_ARGCNT);  /* check number of arguments */
  if (fn_hdr) {                 /* set header flags */
    inflags = AS_ATT | (inflags & ~AS_DFLT);
    if (strcmp(fn_hdr, "-") == 0) fn_hdr = "";
  }                             /* convert "-" to "" */

  /* --- read table header --- */
  attset = as_create("domains", att_delete);
  if (!attset) error(E_NOMEM);  /* create an attribute set */
  as_chars(attset, blanks, fldseps, recseps, uvchars);
  fprintf(stderr, "\n");        /* set delimiter characters */
  in = io_hdr(attset, fn_hdr, fn_tab, inflags, 1);
  if (!in) error(1);            /* read the table header */

  /* --- determine id of class column --- */
  if (!clscol)                  /* if no class column name given, */
    clsid = as_attcnt(attset) -1;     /* use last column as class */
  else {                        /* if class column name given */
    if ((clsid = as_attid(attset, clscol)) < 0) {
      s = (inflags & AS_ATT) ? fn_hdr : fn_tab;
      io_error(E_MISFLD, s, 1, clscol); error(1);
    }                           /* check whether class exists */
  }                             /* and abort on error */

  /* --- read table --- */
  table = io_bodyin(attset, in, fn_tab, inflags, "table", 1);
  in    = NULL;                 /* read the table and */
  if (!table) error(1);         /* check for an error */

  /* --- balance frequencies --- */
  if (fn_frq) {                 /* if frequencies are given */
    if (*fn_frq)                /* if a proper file name is given, */
      in = fopen(fn_frq, "rb"); /* open frequency file for reading */
    else {                      /* if no proper file name is given, */
      in = stdin;               /* read from standard input and use */
      fn_frq = "<stdin>";       /* a printable name in all messages */
    }                           /* (fn_frq is only used for output) */
    fprintf(stderr, "reading %s ... ", fn_frq);
    if (!in) error(E_FOPEN, fn_frq);
    att    = as_att(attset, clsid);
    valcnt = att_valcnt(att);   /* get att. and number of values */
    /* malloc(0) may return NULL, which would be misreported as an  */
    /* out-of-memory error, so always request at least one element  */
    freqs  = malloc((size_t)((valcnt > 0) ? valcnt : 1) *sizeof(*freqs));
    if (!freqs) error(E_NOMEM); /* allocate a frequency vector */
    for (i = valcnt; --i >= 0; ) freqs[i] = 1.0F;
    /* each record consists of a value name and a frequency; values */
    /* that are not listed in the file keep the default frequency 1 */
    for (k = 0; 1; k++) {       /* frequency read loop */
      d = tfs_getfld(as_tfscan(attset), in, buf, AS_MAXLEN);
      if (d <= TFS_EOF) {       /* read next value */
        if (d < 0) error(E_FREAD, fn_frq); else break; }
      if (buf[0] == '\0') {     /* if name read is empty */
        if (d >= TFS_FLD) error(E_EXPVAL, fn_frq, k+1);
        continue;               /* check for a missing name */
      }                         /* and skip empty lines */
      if (d < TFS_FLD) error(E_EXPNUM, fn_frq, k+1);
      i = att_valid(att, buf);  /* get the value identifier */
      if (i <  0) {             /* and check it */
        io_error(E_VALUE, fn_frq, k+1, buf, 1); error(1); }
      d = tfs_getfld(as_tfscan(attset), in, buf, AS_MAXLEN);
      if (d <  0) error(E_FREAD, fn_frq);
      if (d >= TFS_FLD) {       /* check the number of fields */
        io_error(E_FLDCNT, fn_frq, k+1, "", 3, 2); error(1); }
      freqs[i] = strtod(buf, &s);    /* read the value frequency */
      if ((s == buf) || *s || (freqs[i] < 0)) {
        io_error(E_NUMBER, fn_frq, k+1, buf, 2); error(1); }
    }                           /* convert frequency to a number */
    if (in != stdin) fclose(in);/* close the input file */
    in = NULL;                  /* and clear the variable */
    fprintf(stderr, "done.\n"); /* print a success message */
  }
  /* freqs is NULL here if no frequency file was given (no -q) */
  tab_reduce(table);            /* reduce and balance the table */
  tab_balance(table, clsid, wgtsum, freqs);

  /* --- write output table --- */
  if (io_tabout(table, fn_out, outflags, 1) != 0)
    error(1);                   /* write the balanced table */

  /* --- clean up --- */
  #ifndef NDEBUG
  free(freqs);                  /* delete frequency vector, */
  tab_delete(table, 1);         /* table, and attribute set */
  #endif
  #ifdef STORAGE
  showmem("at end of program"); /* check memory usage */
  #endif
  return 0;                     /* return 'ok' */
}  /* main() */