/* www.pudn.com > bayes.rar > uvins.c */
/*----------------------------------------------------------------------
File : uvins.c
Contents: insert unknown values into a table
Authors : Christian Borgelt
History : 20.12.2002 file created
18.01.2003 option -x extended, options -i, -w added
16.08.2003 slight changes in error message output
----------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <limits.h>
#include <time.h>
#include <assert.h>
#ifndef TAB_RDWR
#define TAB_RDWR
#endif
#include "io.h"
#include "vecops.h"
#ifdef STORAGE
#include "storage.h"
#endif
/*----------------------------------------------------------------------
Preprocessor Definitions
----------------------------------------------------------------------*/
/* --- program identification (printed in the startup/usage message) --- */
#define PRGNAME "uvins"
#define DESCRIPTION "insert unknown values into a table"
#define VERSION "version 1.3 (2003.08.16) " \
"(c) 2002-2003 Christian Borgelt"
/* --- error codes --- */
/* All error codes are non-positive. error() negates a code to index  */
/* the errmsgs[] table, so the numbering here must stay in step with  */
/* the order of the messages in that table. Codes below E_UNKNOWN are */
/* clamped to E_UNKNOWN by error().                                   */
#define OK 0 /* no error */
#define E_NONE 0 /* no error */
#define E_NOMEM (-1) /* not enough memory */
#define E_FOPEN (-2) /* file open failed */
#define E_FREAD (-3) /* file read failed */
#define E_FWRITE (-4) /* file write failed */
#define E_OPTION (-5) /* unknown option */
#define E_OPTARG (-6) /* missing option argument */
#define E_ARGCNT (-7) /* wrong number of arguments */
#define E_PERCENT (-8) /* illegal percentage */
#define E_INEX (-9) /* both include and exclude used */
#define E_FLDNAME (-10) /* unknown field name */
#define E_UNKNOWN (-11) /* unknown error */
/*----------------------------------------------------------------------
Constants
----------------------------------------------------------------------*/
/* Error message table, indexed by the negated error code (see error()). */
/* Each entry is a printf-style format string; the conversion it uses    */
/* must match the type of the extra argument passed to error() for that  */
/* code. E_PERCENT is reported with the percentage as a double, hence    */
/* the %g conversion (a %d here would be undefined behaviour).           */
static const char *errmsgs[] = {   /* error messages */
  /* E_NONE      0 */  "no error\n",
  /* E_NOMEM    -1 */  "not enough memory\n",
  /* E_FOPEN    -2 */  "cannot open file %s\n",
  /* E_FREAD    -3 */  "read error on file %s\n",
  /* E_FWRITE   -4 */  "write error on file %s\n",
  /* E_OPTION   -5 */  "unknown option -%c\n",
  /* E_OPTARG   -6 */  "missing option argument\n",
  /* E_ARGCNT   -7 */  "wrong number of arguments\n",
  /* E_PERCENT  -8 */  "illegal percentage %g\n",
  /* E_INEX     -9 */  "both include and exclude used\n",
  /* E_FLDNAME -10 */  "unknown field name \"%s\"\n",
  /* E_UNKNOWN -11 */  "unknown error\n"
};
/*----------------------------------------------------------------------
Global Variables
----------------------------------------------------------------------*/
/* prgname is set from argv[0] at the start of main(); error() asserts */
/* that it is set and prints it in front of every error message.       */
const char *prgname = NULL; /* program name for error messages */
/* The objects below are kept at file scope so that error() can release */
/* them before it terminates the program (only when NDEBUG is not       */
/* defined). They are created in main().                                */
static ATTSET *attset = NULL; /* attribute set */
static TABLE *table = NULL; /* table */
static int *map = NULL; /* column map */
static INST **insts = NULL; /* instance vector */
/*----------------------------------------------------------------------
Random Number Functions
----------------------------------------------------------------------*/
/* dseed(s) seeds the random number generator, drand() yields a double */
/* in the half-open interval [0,1). Which generator backs them is      */
/* selected at compile time through DRAND48.                           */
#ifdef DRAND48                  /* if library for drand48() available */
extern void   srand48 (long seed);
extern double drand48 (void);   /* use drand48 functions */
#define dseed(s)   srand48(s)
#define drand      drand48
#else                           /* if only standard rand() available */
/* The macro argument is parenthesized so that the cast applies to the */
/* whole seed expression and not only to its first operand.            */
#define dseed(s)   srand((unsigned int)(s))
static double drand (void)
{                               /* --- random number in [0,1) */
  /* dividing by RAND_MAX+1.0 (as a double) keeps the result below 1 */
  return rand()/(RAND_MAX +1.0);
}
#endif
/*----------------------------------------------------------------------
Functions
----------------------------------------------------------------------*/
static void error (int code, ...)
{ /* --- print error message */
va_list args; /* list of variable arguments */
const char *msg; /* error message */
assert(prgname); /* check the program name */
if (code < E_UNKNOWN) code = E_UNKNOWN;
if (code < 0) { /* if to report an error, */
msg = errmsgs[-code]; /* get the error message */
if (!msg) msg = errmsgs[-E_UNKNOWN];
fprintf(stderr, "\n%s: ", prgname);
va_start(args, code); /* get variable arguments */
vfprintf(stderr, msg, args);/* print the error message */
va_end(args); /* end argument evaluation */
}
#ifndef NDEBUG /* clean up memory */
if (insts) free(insts);
if (map) free(map);
if (table) tab_delete(table, 0);
if (attset) as_delete(attset);
#endif
#ifdef STORAGE
showmem("at end of program"); /* check memory usage */
#endif
exit(code); /* abort the program */
} /* error() */
/*--------------------------------------------------------------------*/
int main (int argc, char *argv[])
{ /* --- main function */
int i, k = 0, n; /* loop variables, counters */
char *s; /* to traverse options */
char **optarg = NULL; /* option argument */
char *fn_hdr = NULL; /* name of table header file */
char *fn_in = NULL; /* name of table file to read */
char *fn_out = NULL; /* name of table file to write */
char *blanks = NULL; /* blank characters */
char *fldseps = NULL; /* field separators */
char *recseps = NULL; /* record separators */
char *uvchars = NULL; /* unknown value characters */
int clude = 0; /* flag for field in-/exclusion */
int inflags = 0; /* table file read flags */
int outflags = AS_ATT; /* table file write flags */
double percent = 10.0; /* percent unknown values */
int colcnt = 0; /* number of fields/columns */
int csvmem = 0; /* flag for memory conservation */
int rowcnt; /* number of tuples/rows */
long seed; /* random number seed */
TUPLE *tpl; /* to traverse the tuples */
INST *inst; /* a table field */
prgname = argv[0]; /* get program name for error msgs. */
seed = (long)time(NULL); /* get a default seed value */
/* --- print startup/usage message --- */
if (argc > 1) { /* if arguments are given */
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
fprintf(stderr, VERSION); } /* print a startup message */
else { /* if no argument is given */
printf("usage: %s [options] infile outfile\n", argv[0]);
printf("%s\n", DESCRIPTION);
printf("%s\n", VERSION);
printf("-p# percentage of unknown values to insert "
"(default: %g%%)\n", percent);
printf("-i# name of field to include "
"(multiple fields possible)\n");
printf("-x# name of field to exclude "
"(multiple fields possible)\n");
printf("-s# seed value for random number generator "
"(default: time)\n");
printf("-m conserve memory (may slow down operation)\n");
printf("-a# align fields of output table "
"(default: do not align)\n");
printf("-w do not write field names to output file\n");
printf("-b/f/r# blank characters, field and record separators\n"
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
printf("-u# unknown value characters (default: \"?\")\n");
printf("-n number of tuple occurrences in last field\n");
printf("-d use default header "
"(field names = field numbers)\n");
printf("-h read table header (field names) from hdrfile\n");
printf("hdrfile file containing table header (field names)\n");
printf("infile table file to read "
"(field names in first record)\n");
printf("outfile table file to write\n");
return 0; /* print a usage message */
} /* and abort the program */
/* --- evaluate arguments --- */
for (i = 1; i < argc; i++) { /* traverse arguments */
s = argv[i]; /* get option argument */
if (optarg) { *optarg = s; optarg = NULL; continue; }
if ((*s == '-') && *++s) { /* -- if argument is an option */
while (1) { /* traverse characters */
switch (*s++) { /* evaluate option */
case 'p': percent = strtod(s, &s); break;
case 'i': if (clude < 0) error(E_INEX);
clude = +1;
optarg = argv +colcnt++; break;
case 'x': if (clude > 0) error(E_INEX);
clude = -1;
optarg = argv +colcnt++; break;
case 's': seed = strtol(s, &s, 0); break;
case 'm': csvmem = 1; break;
case 'a': outflags |= AS_ALIGN; break;
case 'w': outflags &= ~AS_ATT; break;
case 'b': optarg = &blanks; break;
case 'f': optarg = &fldseps; break;
case 'r': optarg = &recseps; break;
case 'u': optarg = &uvchars; break;
case 'n': inflags |= AS_WEIGHT; break;
case 'd': inflags |= AS_DFLT; break;
case 'h': optarg = &fn_hdr; break;
default : error(E_OPTION, *--s); break;
} /* set option variables */
if (!*s) break; /* if at end of string, abort loop */
if (optarg) { *optarg = s; optarg = NULL; break; }
} } /* get option argument */
else { /* -- if argument is no option */
switch (k++) { /* evaluate non-option */
case 0: fn_in = s; break;
case 1: fn_out = s; break;
default: error(E_ARGCNT); break;
} /* note filenames */
}
}
if (optarg) error(E_OPTARG); /* check option argument */
if (k != 2) error(E_ARGCNT); /* check number of arguments */
if (fn_hdr) { /* set header flags */
inflags = AS_ATT | (inflags & ~AS_DFLT);
if (strcmp(fn_hdr, "-") == 0) fn_hdr = "";
} /* convert "-" to "" */
if ((percent < 0) || (percent > 100))
error(E_PERCENT, percent); /* check the percentage */
/* --- create attribute set and read table --- */
attset = as_create("domains", att_delete);
if (!attset) error(E_NOMEM); /* create an attribute set */
as_chars(attset, blanks, fldseps, recseps, uvchars);
fprintf(stderr, "\n"); /* set delimiter characters */
table = io_tabin(attset, fn_hdr, fn_in, inflags, "table", 1);
if (!table) error(1); /* read the table file */
map = (int*)malloc(tab_colcnt(table) *sizeof(int));
if (!map) error(E_NOMEM); /* create a column map */
/* --- find columns to work on --- */
fprintf(stderr, "inserting unknown values ... ");
if (clude) { /* if to in-/exclude certain columns */
k = (clude > 0) ? 0 : 1; /* initialize the column map */
for (i = n = tab_colcnt(table); --i >= 0; ) map[i] = k;
for (i = colcnt; --i >= 0; ) { /* traverse the field names */
k = as_attid(attset, argv[i]);
if (k < 0) error(E_FLDNAME, argv[i]);
map[k] = clude; /* mark/unmark the columns */
} /* to include or exclude */
for (colcnt = i = 0; i < n; i++) /* collect the column numbers */
if (map[i] > 0) map[colcnt++] = i; }
else { /* if to work on all columns */
colcnt = tab_colcnt(table); /* get the number of columns */
for (i = colcnt; --i >= 0; ) map[i] = i;
} /* create an identity map */
rowcnt = tab_tplcnt(table); /* get the number of tuples */
/* --- insert unknown values --- */
dseed(seed); /* traverse the unknowns to insert */
if (csvmem) { /* if to conserve memory */
for (n = (int)(0.01 *percent *rowcnt *colcnt +0.4999); --n >= 0; ) {
do { /* table field search loop */
i = (int)(colcnt *drand());
if (i < 0) i = 0; /* compute a random column index */
if (i >= colcnt) i = colcnt-1;
k = (int)(rowcnt *drand());
if (k < 0) k = 0; /* compute a random row index */
if (k >= rowcnt) k = rowcnt-1;
inst = tpl_colval(tab_tpl(table, k), map[i]);
} while (inst->i < 0); /* find a known table field */
inst->i = UV_SYM; /* and replace its contents */
} } /* with an unknown value */
else { /* if to use an instance vector */
insts = (INST**)malloc(colcnt *rowcnt *sizeof(INST*));
if (!insts) error(E_NOMEM); /* create an instance vector */
for (n = 0, k = rowcnt; --k >= 0; ) {
tpl = tab_tpl(table, k); /* traverse the tuples of the table */
for (i = colcnt; --i >= 0; )
insts[n++] = tpl_colval(tpl, map[i]);
} /* collect the eligible instances */
v_shuffle(insts, n, drand); /* and shuffle them */
for (n = (int)(0.01 *percent *n +0.4999); --n >= 0; )
insts[n]->i = UV_SYM; /* set the first 'percent' instances */
} /* to an unknown value */
fprintf(stderr, "done.\n"); /* print a success message */
/* --- write the output table --- */
io_tabout(table, fn_out, outflags, 1);
/* --- clean up --- */
#ifndef NDEBUG
if (insts) free(insts); /* delete instances vector, */
free(map); /* column map, */
tab_delete(table, 1); /* table and attribute set */
#endif
#ifdef STORAGE
showmem("at end of program"); /* check memory usage */
#endif
return 0; /* return 'ok' */
} /* main() */