www.pudn.com > bayes.rar > tsplit.c


/*----------------------------------------------------------------------
  File    : tsplit.c
  Contents: program to split table into subtables
  Author  : Christian Borgelt
  History : 24.02.1998 file created
            12.09.1998 adapted to modified module attset
            25.09.1998 table reading simplified
            06.02.1999 arbitrary sample size made possible
            07.02.1999 input from stdin, output to stdout added
            12.02.1999 default header handling improved
            17.04.1999 simplified using the new module 'io'
            14.07.2001 adapted to modified module tfscan
            16.08.2003 slight changes in error message output
----------------------------------------------------------------------*/
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#ifndef AS_RDWR
#define AS_RDWR
#endif
#ifndef TAB_RDWR
#define TAB_RDWR
#endif
#include "io.h"
#ifdef STORAGE
#include "storage.h"
#endif

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
#define PRGNAME     "tsplit"
#define DESCRIPTION "split a table into subtables"
#define VERSION     "version 1.6 (2003.08.16)         " \
                    "(c) 1998-2003   Christian Borgelt"

/* --- error codes --- */
#define OK            0         /* no error */
#define E_NONE        0         /* no error */
#define E_NOMEM     (-1)        /* not enough memory */
#define E_FOPEN     (-2)        /* file open failed */
#define E_FREAD     (-3)        /* file read failed */
#define E_FWRITE    (-4)        /* file write failed */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* wrong number of arguments */
#define E_FLDNAME   (-8)        /* illegal field name */
#define E_EMPTAB    (-9)        /* empty table */
#define E_SMLTAB   (-10)        /* table too small for sample */
#define E_UNKNOWN  (-11)        /* unknown error */

/*----------------------------------------------------------------------
  Constants
----------------------------------------------------------------------*/
static const char *errmsgs[] = {   /* error messages */
  /* E_NONE      0 */  "no error\n",
  /* E_NOMEM    -1 */  "not enough memory\n",
  /* E_FOPEN    -2 */  "cannot open file %s\n",
  /* E_FREAD    -3 */  "read error on file %s\n",
  /* E_FWRITE   -4 */  "write error on file %s\n",
  /* E_OPTION   -5 */  "unknown option -%c\n",
  /* E_OPTARG   -6 */  "missing option argument\n",
  /* E_ARGCNT   -7 */  "wrong number of arguments\n",
  /* E_FLDNAME  -8 */  "illegal field name \"%s\"\n",
  /* E_EMPTAB   -9 */  "table is empty\n",
  /* E_SMLTAB  -10 */  "table is too small for sample\n",
  /* E_UNKNOWN -11 */  "unknown error\n"
};

/*----------------------------------------------------------------------
  Global Variables
----------------------------------------------------------------------*/
const  char   *prgname = NULL;  /* program name for error messages */
static ATTSET *attset  = NULL;  /* attribute set */
static TABLE  *table   = NULL;  /* table */
static FILE   *in      = NULL;  /* input  file */
static FILE   *out     = NULL;  /* output file */
static char   fn_out[1024];     /* output file name */

/*----------------------------------------------------------------------
  Random Number Functions
----------------------------------------------------------------------*/

#ifdef DRAND48                  /* if library for drand48() available */
extern void   srand48 (long seed);
extern double drand48 (void);   /* use drand48 functions */
#define dseed(s) srand48(s)
#define drand    drand48

#else                           /* if only standard rand() available */
#define dseed(s) srand((unsigned int)s)
static double drand (void)
{ return rand()/(RAND_MAX +1.0); }
#endif

/*----------------------------------------------------------------------
  Comparison Function
----------------------------------------------------------------------*/

static int tplcmp (const TUPLE *tpl1, const TUPLE *tpl2, void *data)
{                               /* --- compare two tuples */
  const INST *col1, *col2;      /* buffer for column values */

  col1 = tpl_colval(tpl1, (int)data);   /* get column values */
  col2 = tpl_colval(tpl2, (int)data);   /* for both tuples */
  if (col1->i > col2->i) return  1;
  if (col1->i < col2->i) return -1;
  return 0;                     /* return sign of diff. of values */
}  /* tplcmp() */

/*----------------------------------------------------------------------
  Functions
----------------------------------------------------------------------*/

static void error (int code, ...)
{                               /* --- print error message */
  va_list    args;              /* list of variable arguments */
  const char *msg;              /* error message */

  assert(prgname);              /* check the program name */
  if (code < E_UNKNOWN) code = E_UNKNOWN;
  if (code < 0) {               /* if to report an error, */
    msg = errmsgs[-code];       /* get the error message */
    if (!msg) msg = errmsgs[-E_UNKNOWN];
    fprintf(stderr, "\n%s: ", prgname);
    va_start(args, code);       /* get variable arguments */
    vfprintf(stderr, msg, args);/* print error message */
    va_end(args);               /* end argument evaluation */
  }
  #ifndef NDEBUG
  if (table)  tab_delete(table, 0); /* clean up memory */
  if (attset) as_delete(attset);    /* and close files */
  if (in  && (in  != stdin))  fclose(in);
  if (out && (out != stdout)) fclose(out);
  #endif
  #ifdef STORAGE
  showmem("at end of program"); /* check memory usage */
  #endif
  exit(code);                   /* abort the program */
}  /* error() */

/*--------------------------------------------------------------------*/

int main (int argc, char *argv[])
{                               /* --- main function */
  int    i, k = 0;              /* loop variables, counters */
  char   *s;                    /* to traverse options */
  char   **optarg = NULL;       /* option argument */
  char   *fn_hdr  = NULL;       /* name of table header file */
  char   *fn_tab  = NULL;       /* name of table file */
  char   *blanks  = NULL;       /* blank  characters */
  char   *fldseps = NULL;       /* field  separators */
  char   *recseps = NULL;       /* record separators */
  char   *uvchars = NULL;       /* unknown value characters */
  char   *colname = NULL;       /* name of field to base split on */
  char   *pattern = "%i.tab";   /* output file name pattern */
  int    inflags  = 0;          /* table file read  flags */
  int    outflags = AS_ATT;     /* table file write flags */
  int    shuffle  = 0;          /* flag for tuple shuffling */
  int    sample   = 0;          /* flag for drawing a sample */
  int    colid    = -1;         /* column identifier */
  int    tplcnt   = 0;          /* number of tuples */
  double tplwgt   = 0;          /* weight of tuples */
  double tabcnt   = 0;          /* number of tables */
  int    tabid;                 /* table identifier */
  long   seed;                  /* random number seed */
  int    size, first, tplid;    /* table size and tuple index */
  int    prev, val;             /* (previous) column value */
  double off, wgt, tmp;         /* offset, tuple weight and buffer */
  TUPLE  *tpl;                  /* tuple to traverse table */
  int    one_in_n;              /* flag for one in n selection */
  int    done = 0;              /* completion flag */

  prgname = argv[0];            /* get program name for error msgs. */
  seed    = (long)time(NULL);   /* get a default seed value */

  /* --- print startup/usage message --- */
  if (argc > 1) {               /* if arguments are given */
    fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
    fprintf(stderr, VERSION); } /* print a startup message */
  else {                        /* if no argument is given */
    printf("usage: %s [options] [-d|-h hdrfile] tabfile\n", argv[0]);
    printf("%s\n", DESCRIPTION);
    printf("%s\n", VERSION);
    printf("-c#      name of field/column to base split on "
                    "(default: none)\n");
    printf("         (stratified sampling if the option -t "
                    "is also given)\n");
    printf("-x       shuffle tuples before operation\n");
    printf("-s#      seed value for random number generator "
                    "(default: time)\n");
    printf("-t#      number of subtables to split into\n");
    printf("-p#      draw a sample with # tuples (one output table)\n");
    printf("-o#      output file name pattern "
                    "(default: \"%s\")\n", pattern);
    printf("-a       align fields of output tables "
                    "(default: do not align)\n");
    printf("-w       do not write field names to output files\n");
    printf("-b/f/r#  blank characters, field and record separators\n"
           "         (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
    printf("-u#      unknown value characters (default: \"?\")\n");
    printf("-n       number of tuple occurences in last field\n");
    printf("-d       use default header "
                    "(field names = field numbers)\n");
    printf("-h       read table header (field names) from hdrfile\n");
    printf("hdrfile  file containing table header (field names)\n");
    printf("tabfile  table file to read "
                    "(field names in first record)\n");
    return 0;                   /* print a usage message */
  }                             /* and abort the program */

  /* --- evaluate arguments --- */
  for (i = 1; i < argc; i++) {  /* traverse arguments */
    s = argv[i];                /* get option argument */
    if (optarg) { *optarg = s; optarg = NULL; continue; }
    if ((*s == '-') && *++s) {  /* -- if argument is an option */
      while (1) {               /* traverse characters */
        switch (*s++) {         /* evaluate option */
          case 'c': optarg    = &colname;              break;
          case 'x': shuffle   = 1;                     break;
          case 's': seed      =      strtol(s, &s, 0); break;
          case 't': tabcnt    =      strtol(s, &s, 0); break;
          case 'p': sample    = (int)strtol(s, &s, 0); break;
   	  case 'o': optarg    = &pattern;              break;
          case 'a': outflags |= AS_ALIGN;              break;
          case 'w': outflags &= ~AS_ATT;               break;
  	  case 'b': optarg    = &blanks;               break;
          case 'f': optarg    = &fldseps;              break;
          case 'r': optarg    = &recseps;              break;
          case 'u': optarg    = &uvchars;              break;
          case 'n': outflags |= AS_WEIGHT;
                    inflags  |= AS_WEIGHT;             break;
          case 'd': inflags  |= AS_DFLT;               break;
          case 'h': optarg    = &fn_hdr;               break;
          default : error(E_OPTION, *--s);             break;
        }                       /* set option variables */
        if (!*s) break;         /* if at end of string, abort loop */
        if (optarg) { *optarg = s; optarg = NULL; break; }
      } }                       /* get option argument */
    else {                      /* -- if argument is no option */
      switch (k++) {            /* evaluate non-option */
        case  0: fn_tab = s;      break;
        default: error(E_ARGCNT); break;
      }                         /* note filenames */
    }
  }
  if (optarg) error(E_OPTARG);  /* check option argument */
  if (k != 1) error(E_ARGCNT);  /* check number of arguments */
  if (fn_hdr && (strcmp(fn_hdr, "-") == 0))
    fn_hdr = "";                /* convert "-" to "" */
  if (fn_hdr)                   /* set header flags */
    inflags = AS_ATT | (inflags & ~AS_DFLT);
  if ((outflags & AS_ALIGN) && (outflags & AS_ATT))
    outflags |= AS_ALNHDR;      /* set align to header flag */

  /* --- create attribute set and read table --- */
  attset = as_create("domains", att_delete);
  if (!attset) error(E_NOMEM);  /* create an attribute set */
  as_chars(attset, blanks, fldseps, recseps, uvchars);
  fprintf(stderr, "\n");        /* set delimiter characters */
  in = io_hdr(attset, fn_hdr, fn_tab, inflags, 1);
  if (!in) error(1);            /* read the table header */
  if (colname) {                /* if a field/column name is given */
    colid = as_attid(attset, colname);
    if (colid < 0) error(E_FLDNAME, colname);
  }                             /* get the column identifier */
  table = io_bodyin(attset, in, fn_tab, inflags, "table", 1);
  in    = NULL;                 /* read the table body */
  if (!table) error(1);         /* and check for an error */
  tplcnt = tab_tplcnt(table);   /* get and check the table size */
  if (tplcnt <= 0)     error(E_EMPTAB);
  if (sample > tplcnt) error(E_SMLTAB);

  /* --- split table --- */
  if (shuffle) {                /* if the shuffle flag is set, */
    dseed(seed);                /* init. random number generator */
    tab_shuffle(table, 0, INT_MAX, drand);
  }                             /* shuffle tuples in table */
  if (colid >= 0)               /* sort table w.r.t. given column */
    tab_sort(table, 0, INT_MAX, tplcmp, (void*)colid);
  if (sample > 0) tabcnt = (tplcnt/(double)sample) *(1 +1e-12);
  one_in_n = ((colid < 0) || (tabcnt > 0));
  if (tabcnt <= 0) tabcnt = 1;  /* get tuple selection mode */
  val   = UV_SYM;               /* clear current and get first value */
  prev  = (colid >= 0) ? tpl_colval(tab_tpl(table, 0), colid)->i : val;
  size  = tplcnt;               /* note number of tuples in table, */
  first = tplid = tabid = 0;    /* initialize tuple and table index */
  do {                          /* table write loop */
    if (!*pattern) {            /* if no file name pattern is given, */
      out = stdout; strcpy(fn_out, ""); } /* write to stdout */
    else {                      /* if a file name pattern is given */
      sprintf(fn_out, pattern, tabid++);
      out = fopen(fn_out, "w"); /* open output file for writing */
    }
    fprintf(stderr, "writing %s ... ", fn_out);
    if (!out) error(E_FOPEN, fn_out);
    if ((outflags & AS_ATT)     /* if to write table header */
    &&  (as_write(attset, out, outflags) != 0))
      error(E_FWRITE, fn_out);  /* write field names to subtable */
    k = AS_INST | (outflags & ~AS_ATT);
    tplcnt = 0; tplwgt = 0;     /* initialize tuple counter */
    if (one_in_n) {             /* if to select every n-th tuple */
      tplid = 0; off = first;   /* get next tuple offset */
      if ((++first >= tabcnt) || sample)
        done = 1;               /* if last table, set done flag */
      while (tplid < size) {    /* while not at end of table */
        tpl = tab_tpl(table, tplid++);  /* get next tuple */
        wgt = floor(tpl_getwgt(tpl));   /* and its weight */
        if (wgt <= off) {       /* if offset is larger than weight, */
          off -= wgt; continue; }                /* skip this tuple */
        tpl_toas(tpl);          /* transfer tuple to attribute set */
        tplwgt += tmp = ceil((wgt -off) /tabcnt);
        as_setwgt(attset, tmp); /* set weight instantiation weight */
        if (as_write(attset, out, k) != 0)
          error(E_FWRITE, fn_out);  /* write instantiation (tuple) */
        off = fmod(off +tmp *tabcnt -wgt, tabcnt);
        tplcnt++;               /* compute next offset and */
      } }                       /* increment tuple counter */
    else {                      /* if to split according to values */
      while (tplid < size) {    /* while not all tuples processed */
        tpl = tab_tpl(table, tplid);      /* get next tuple and */
        val = tpl_colval(tpl, colid)->i;  /* its column value */
        if (val != prev) break; /* if next value reached, abort */
        tpl_toas(tpl);          /* transfer tuple to attribute set */
        if (as_write(attset, out, k) != 0)
          error(E_FWRITE, fn_out);  /* write instantiation (tuple) */
        tplwgt += as_getwgt(attset);
        tplcnt++; tplid++;      /* write instantiation (tuple) */
      }                         /* and increment tuple counter */
      if (tplid >= size) done = 1;
      prev = val;               /* check for completion and */
    }                           /* note the current column value */
    if (out == stdout) {        /* if written to standard output, */
      if (!done) printf("\n");} /* separate tables by an empty line */
    else {                      /* if not written to standard output */
      i = fclose(out); out = NULL;  /* close output file */
      if (i != 0) error(E_FWRITE, fn_out);
    }                           /* print a success message */
    fprintf(stderr, "[%d/%g tuple(s)] done.\n", tplcnt, tplwgt);
  } while (!done);              /* while not all tables written */

  /* --- clean up --- */
  #ifndef NDEBUG
  tab_delete(table, 1);         /* delete table and attribute set */
  #endif
  #ifdef STORAGE
  showmem("at end of program"); /* check memory usage */
  #endif
  return 0;                     /* return 'ok' */
}  /* main() */