www.pudn.com > bayes.rar > corr.c
/*----------------------------------------------------------------------
File : corr.c
Contents: covariances/correlation coefficients computation program
Author : Christian Borgelt
History : 07.04.1999 file created from file xmat.c
10.04.1999 first version completed
11.04.1999 options -x, -v, and -c added
14.04.1999 option -k (expected values from known pairs)
01.12.1999 bug in connection with option -d removed
10.11.2000 adapted to new module mvnorm
07.06.2002 LaTeX output option added
16.08.2003 slight changes in error message output
22.04.2004 bug in function dblout fixed (log(0))
----------------------------------------------------------------------*/
#include
#include
#include
#include
#include
#include
#include
#include "symtab.h"
#include "tfscan.h"
#include "mvnorm.h"
#ifdef STORAGE
#include "storage.h"
#endif
/*----------------------------------------------------------------------
Preprocessor Definitions
----------------------------------------------------------------------*/
#define PRGNAME "corr"
#define DESCRIPTION "compute covariance matrix/correlation coefficients"
#define VERSION "version 2.7 (2006.10.06) " \
"(c) 1999-2006 Christian Borgelt"
/* --- sizes --- */
#define BUFSIZE 512 /* size of read buffer */
#define BLKSIZE 32 /* block size for column vector */
/* --- error codes --- */
#define OK 0 /* no error */
#define E_NONE 0 /* no error */
#define E_NOMEM (-1) /* not enough memory */
#define E_FOPEN (-2) /* file open failed */
#define E_FREAD (-3) /* file read failed */
#define E_FWRITE (-4) /* file write failed */
#define E_OPTION (-5) /* unknown option */
#define E_OPTARG (-6) /* missing option argument */
#define E_ARGCNT (-7) /* wrong number of arguments */
#define E_STDIN (-8) /* double assignment of stdin */
#define E_FLDNAME (-9) /* illegal field name */
#define E_VALUE (-10) /* illegal value */
#define E_EMPFLD (-11) /* empty field name */
#define E_DUPFLD (-12) /* duplicate field name */
#define E_FLDCNT (-13) /* wrong number of fields */
#define E_UNKNOWN (-14) /* unknown error */
/*----------------------------------------------------------------------
Type Definitions
----------------------------------------------------------------------*/
typedef struct { /* --- set of table columns --- */
int colvsz; /* size of column vector */
int colcnt; /* number of matrix columns */
int vldcnt; /* number of valid matrix columns */
int maxlen; /* maximal length of a column name */
const char **names; /* table column names */
double *vals; /* current values */
} COLSET; /* (set of table columns) */
/*----------------------------------------------------------------------
Constants
----------------------------------------------------------------------*/
static const char *errmsgs[] = { /* error messages */
/* E_NONE 0 */ "no error\n",
/* E_NOMEM -1 */ "not enough memory\n",
/* E_FOPEN -2 */ "cannot open file %s\n",
/* E_FREAD -3 */ "read error on file %s\n",
/* E_FWRITE -4 */ "write error on file %s\n",
/* E_OPTION -5 */ "unknown option -%c\n",
/* E_OPTARG -6 */ "missing option argument\n",
/* E_ARGCNT -7 */ "wrong number of arguments\n",
/* E_STDIN -8 */ "double assignment of standard input\n",
/* E_FLDNAME -9 */ "illegal field name \"%s\"\n",
/* E_VALUE -10 */ "file %s, record %d: "
"illegal value \"%s\" in field %d\n",
/* E_EMPFLD -11 */ "file %s, record %d: "
"empty name%s in field %d\n",
/* E_DUPFLD -12 */ "file %s, record %d: "
"duplicate field name \"%s\"\n",
/* E_FLDCNT -13 */ "file %s, record %d: "
"%d field(s) expected\n",
/* E_UNKNOWN -14 */ "unknown error\n"
};
/*----------------------------------------------------------------------
Global Variables
----------------------------------------------------------------------*/
static char *prgname; /* program name for error messages */
static TFSCAN *tfscan = NULL; /* table file scanner */
static SYMTAB *symtab = NULL; /* symbol table */
static COLSET *colset = NULL; /* column descriptions */
static MVNORM *mvnorm = NULL; /* multivariate normal distribution */
static FILE *in = NULL; /* input file */
static FILE *out = NULL; /* output file */
static char rdbuf[BUFSIZE]; /* read buffer */
static char fnbuf[BUFSIZE]; /* field name buffer */
/*----------------------------------------------------------------------
Column Set Functions
----------------------------------------------------------------------*/
#define cs_create() (COLSET*)calloc(1, sizeof(COLSET))
#define cs_colcnt(s) ((s)->colcnt)
#define cs_proc(s,m,w) mvn_add (m, (s)->vals, w)
#define cs_procx(s,m,w) mvn_addx(m, (s)->vals, w)
/*--------------------------------------------------------------------*/
#ifndef NDEBUG
static void cs_delete (COLSET *cset)
{ /* --- delete a column set */
if (cset->names) /* if there is a names vector, */
free((void*)cset->names); /* delete it */
if (cset->vals) /* if there is a value vector, */
free((void*)cset->vals); /* delete it */
free(cset); /* delete the column set body */
} /* cs_delete() */
#endif
/*--------------------------------------------------------------------*/
static int cs_add (COLSET *cset, const char *name)
{ /* --- add a column to a column set */
int n; /* new column vector size, buffer */
void *p; /* new names/values vector */
assert(cset && name); /* check the function arguments */
n = cset->colvsz; /* get the column vector size and */
if (cset->colcnt >= n) { /* if the column vector is full */
n += (n > BLKSIZE) ? (n >> 1) : BLKSIZE;
p = realloc((void*)cset->names, n *sizeof(const char*));
if (!p) return -1; /* enlarge the names vector */
cset->names = (const char**)p;
p = realloc(cset->vals, n *sizeof(double));
if (!p) return -1; /* enlarge the names vector */
cset->vals = (double*)p; cset->colvsz = n;
} /* set the new vectors and their size */
cset->names[cset->colcnt ] = name;
cset->vals [cset->colcnt++] = MVN_UNKNOWN;
return 0; /* note the table column's name */
} /* cs_add() */ /* and return 'ok' */
/*--------------------------------------------------------------------*/
static void cs_set (COLSET *cset, int colid, const char *name)
{ /* --- set a column value */
char *s; /* end pointer for conversion */
double *val; /* pointer to the value to set */
assert(cset && (colid >= 0) && (colid < cset->colcnt));
if (!cset->names[colid]) /* if the column name has been */
return; /* deleted, abort the function */
val = cset->vals +colid; /* get the value buffer */
if (!name) { /* if no value name is given, */
*val = MVN_UNKNOWN; return; } /* the value is unknown */
*val = strtod(name, &s); /* convert the attribute value */
if (*s || (s == name)) { /* if it is symbolic */
*val = MVN_UNKNOWN; cset->names[colid] = NULL; }
} /* cs_set() */ /* invalidate the column */
/*--------------------------------------------------------------------*/
static void cs_prepare (COLSET *cset)
{ /* --- prepare columns for output */
int i, len; /* loop variable, length of a name */
const char *name; /* to traverse the column names */
cset->vldcnt = 0; /* init. the number of valid columns */
cset->maxlen = 9; /* and the maximal length of a name */
for (i = 0; i < cset->colcnt; i++) {
name = cset->names[i]; /* traverse the columns */
if (!name) continue; /* skip invalidated columns */
cset->vldcnt++; /* count the valid columns */
len = (int)strlen(name); /* determine the length of the name */
if (len > cset->maxlen) cset->maxlen = len;
} /* adapt the maximal length */
} /* cs_prepare() */
/*----------------------------------------------------------------------
Auxiliary Functions
----------------------------------------------------------------------*/
static void dblout (FILE *out, double num, int len)
{ /* --- print a floating point number */
int n, d; /* number of characters/decimals */
char m[16], e[8]; /* mantissa and exponent */
if (num >= 0.0) { m[0] = ' '; }
else { num = -num; m[0] = '-'; }
len--; /* determine and store the sign */
n = (num == 0) /* calculate the decimal exponent */
? 0 : (int)floor(log10(num));
if ((n > len) || (n <= -3)) { /* if an exponent is needed, */
num /= pow(10, n); /* comp. mantissa and note exponent */
d = len -2 -(n = sprintf(e, "e%d", n)); }
else { /* if no exponent is needed, */
d = len -((n > 0) ? n+2 : 2); /* compute the number */
e[0] = n = 0; /* of decimal places possible */
} /* and clear the exponent */
n += sprintf(m+1, "%.*f", (d <= 0) ? 0 : d, num);
while (n < len) { /* format the mantissa, */
fputc(' ', out); n++; } /* fill to the requested length, */
fputs(m, out); fputs(e, out); /* and print mantissa and exponent */
} /* dblout() */
/*----------------------------------------------------------------------
The above function is needed, because a format string used with the
function fprintf does not yield appropriate results.
----------------------------------------------------------------------*/
static int isunk (TFSCAN *tfscan, const char *s)
{ /* --- test for an unknown value */
assert(tfscan && s); /* check arguments */
while (*s) /* while not at end of string */
if (!tfs_istype(tfscan, TFS_OTHER, *s++))
return 0; /* if a character is not marked, */
return 1; /* return 'not an unknown value', */
} /* isunk() */ /* otherwise return 'unknown value' */
/*----------------------------------------------------------------------
Output Functions
----------------------------------------------------------------------*/
static void expvar (COLSET *cset, MVNORM *mvn,
int tex, int cnt, FILE *out)
{ /* --- print exp. values/variances */
int i, k, n = 0; /* loop variables */
const char *name; /* to traverse the column names */
double t; /* expected value and variance */
char *s1, *s2, *s3; /* separators */
char *end; /* end of line */
assert(mvn && out); /* check the function arguments */
cs_prepare(cset); /* prepare columns for output */
if (tex) { /* if to generate LaTeX code */
fputs("\\begin{tabular}{|r|l|r|r|r|}\\hline\n", out);
fputs("\\multicolumn{5}{|l|}", out);
fputs("{attribute statistics\\rule{0pt}{2.4ex}} \\\\\n", out);
fputs("\\hline\\hline\\rule{0pt}{2.4ex}%\n", out);
fputs("no & attribute", out); /* print a header */
for (k = 9; k < cset->maxlen; k++) fputc(' ', out);
fputs(" & exp. val. & variance & std. dev.", out);
fputs(" \\\\\n\\hline\\rule{0pt}{2.4ex}%\n", out);
s1 = " & "; s2 = " &$"; s3 = "$&$"; end = "$ \\\\\n"; }
else { /* if to generate normal text */
fputs("attribute statistics\n", out);
fputs(" no | attribute", out); /* print a header */
for (k = 9; k < cset->maxlen; k++) fputc(' ', out);
fputs(" | exp. val. | variance | std. dev.", out);
if (cnt) fputs(" | cnt", out);
fputs("\n----+-", out);
for (k = 0; k < cset->maxlen; k++) fputc('-', out);
fputs("-+-----------+-----------+----------", out);
if (cnt) fputs("-+------", out);
s1 = s2 = s3 = " | "; end = "\n";
fputc('\n', out);
}
for (i = 0; i < cset->colcnt; i++) {
name = cset->names[i]; /* traverse the matrix columns, */
if (!name) continue; /* but skip invalidated columns */
fprintf(out, "%3i", ++n); /* print the column number, */
fputs(s1, out); /* a separator, */
fputs(name, out); /* and the column name */
for (k = (int)strlen(name); k < cset->maxlen; k++)
fputc(' ', out); /* fill the output field */
fputs(s2, out); /* retrieve and */
t = mvn_exp(mvn, i); /* print the expected value */
if (t <= MVN_UNKNOWN) fputs(" ?", out);
else dblout(out, t, 9);
t = mvn_var(mvn, i); /* retrieve and */
fputs(s3, out); /* print the variance */
if (t < 0) fputs(" ?", out);
else dblout(out, t, 9);
fputs(s3, out); /* print the standard deviation */
if (t < 0) fputs(" ?", out);
else dblout(out, sqrt(t), 9);
if (!tex && cnt) /* if to print counters */
fprintf(out, " | %5g", mvn_cnt(mvn, i));
fputs(end, out); /* terminate the output line */
}
if (tex) /* if to generate LaTeX output */
fputs("\\hline\n\\end{tabular}\n", out);
} /* expvar() */
/*--------------------------------------------------------------------*/
static void covar (COLSET *cset, MVNORM *mvn, int tex, FILE *out)
{ /* --- print covariance matrix */
int x, y, n = 0; /* loop variables */
const char *xname, *yname; /* to traverse the column names */
double cov; /* covariance */
char *s1, *s2, *s3; /* separators */
char *end; /* end of line */
assert(cset && mvn && out); /* check the function arguments */
cs_prepare(cset); /* prepare columns for output */
if (tex) { /* if to generate LaTeX output */
fprintf(out, "\\begin{tabular}{|r|l|*{%d}{r|}}", cset->vldcnt);
fputs("\\hline\n", out); /* print a header */
fprintf(out, "\\multicolumn{%d}{|l|}", cset->vldcnt +2);
fputs("{covariance matrix\\rule{0pt}{2.4ex}} \\\\\n", out);
fputs("\\hline\\hline\\rule{0pt}{2.4ex}%\n", out);
fputs("no & attribute", out);
for (x = 9; x < cset->maxlen; x++) fputc(' ', out);
for (x = 1; x <= cset->vldcnt; x++) fprintf(out, " & %8i", x);
fputs(" \\\\\n\\hline\\rule{0pt}{2.4ex}%\n", out);
s1 = " & "; s2 = " &"; s3 = "$&$"; end = "$ \\\\\n"; }
else { /* if to generate text output */
fputs("covariance matrix\nno | attribute", out);
for (x = 9; x < cset->maxlen; x++) fputc(' ', out);
fputs(" |", out); /* print a header */
for (x = 1; x <= cset->vldcnt; x++) fprintf(out, " %8i", x);
fputs("\n---+-", out);
for (x = 0; x < cset->maxlen; x++) fputc('-', out);
fputs("-+", out);
for (x = 0; x < cset->vldcnt; x++) fputs("---------", out);
fputc('\n', out); /* print a separating line */
s1 = " | "; s2 = " |"; s3 = " "; end = "\n";
}
for (y = 0; y < cset->colcnt; y++) {
yname = cset->names[y]; /* traverse the matrix columns, */
if (!yname) continue; /* but skip invalidated columns */
fprintf(out, "%2i", ++n); /* print the column number, */
fputs(s1, out); /* a separator, */
fputs(yname, out); /* and the column name */
for (x = (int)strlen(yname); x < cset->maxlen; x++)
fputc(' ', out); /* fill the output field */
fputs(s2, out); /* print a separator */
for (x = 1; x < n; x++) { /* skip lower left part of matrix */
fputs(" ", out); if (tex) fputs(s2, out); }
fputc(s3[0], out); /* start a number */
for (x = y; x < cset->colcnt; x++) {
xname = cset->names[x]; /* traverse the remaining columns, */
if (!xname) continue; /* but skip invalidated columns */
if (x > y) fputs(s3,out); /* print a separator */
cov = mvn_cov(mvn, x, y); /* get and print the covariance */
if (cov <= MVN_UNKNOWN) fputs(" ?", out);
else dblout(out, cov, 8);
}
fputs(end, out); /* terminate the output line */
}
if (tex) /* if to generate LaTeX output */
fputs("\\hline\n\\end{tabular}\n", out);
} /* covar() */
/*--------------------------------------------------------------------*/
static void correl (COLSET *cset, MVNORM *mvn, int tex, FILE *out)
{ /* --- print correlation coefficients */
int x, y, n = 0; /* loop variables */
const char *xname, *yname; /* to traverse the column names */
double r, t; /* correlation coefficient, buffer */
char *s1, *s2, *s3; /* separators */
char *end; /* end of line */
assert(cset && mvn && out); /* check the function arguments */
cs_prepare(cset); /* prepare columns for output */
if (tex) { /* if to generate LaTeX output */
fprintf(out, "\\begin{tabular}{|r|l|*{%d}{r|}}", cset->vldcnt);
fputs("\\hline\n", out); /* print a header */
fprintf(out, "\\multicolumn{%d}{|l|}", cset->vldcnt +2);
fputs("{correlation coefficients\\rule{0pt}{2.4ex}} \\\\\n", out);
fputs("\\hline\\hline\\rule{0pt}{2.4ex}%\n", out);
fputs("no & attribute", out);
for (x = 9; x < cset->maxlen; x++) fputc(' ', out);
for (x = 1; x <= cset->vldcnt; x++) fprintf(out, " & %5i", x);
fputs(" \\\\\n\\hline\\rule{0pt}{2.4ex}%\n", out);
s1 = " & "; s2 = " &"; s3 = "$&$"; end = "$ \\\\\n"; }
else { /* if to generate text output */
fputs("correlation coefficients\nno | attribute", out);
for (x = 9; x < cset->maxlen; x++) fputc(' ', out);
fputs(" |", out); /* print a header */
for (x = 1; x <= cset->vldcnt; x++) fprintf(out, " %5i", x);
fputs("\n---+-", out);
for (x = 0; x < cset->maxlen; x++) fputc('-', out);
fputs("-+", out);
for (x = 0; x < cset->vldcnt; x++) fputs("------", out);
fputc('\n', out); /* print a separating line */
s1 = " | "; s2 = " |"; s3 = " "; end = "\n";
}
for (y = 0; y < cset->colcnt; y++) {
yname = cset->names[y]; /* traverse the matrix columns, */
if (!yname) continue; /* but skip invalidated columns */
fprintf(out, "%2i", ++n); /* print the column number, */
fputs(s1, out); /* a separator, */
fputs(yname, out); /* and the column name */
for (x = (int)strlen(yname); x < cset->maxlen; x++)
fputc(' ', out); /* fill the output field */
fputs(s2, out); /* print a separator */
for (x = 1; x < n; x++) { /* skip lower left part of matrix */
fputs(" ", out); if (tex) fputs(s2, out); }
fputc(s3[0], out); /* start a number */
fputs(" 1.00", out); /* the diagonal is always 1 */
for (x = y+1; x < cset->colcnt; x++) {
xname = cset->names[x]; /* traverse the remaining columns, */
if (!xname) continue; /* but skip invalidated columns */
fputs(s3, out); /* print a separator */
r = mvn_corr(mvn, x, y); /* get and check the correl. coeff. */
if (r < MVN_UNKNOWN) { fputs(" ?", out); continue; }
t = fabs(r); /* print the correlation coefficient */
fputs(((r >= 0) || (t < 0.0005)) ? " " : "-", out);
if (t >= 0.9995) fputs("1.00", out);
else fprintf(out, ".%03.0f", t *1000);
}
fputs(end, out); /* terminate the output line */
}
if (tex) /* if to generate LaTeX output */
fputs("\\hline\n\\end{tabular}\n", out);
} /* correl() */
/*----------------------------------------------------------------------
Functions
----------------------------------------------------------------------*/
static void error (int code, ...)
{ /* --- print error message */
va_list args; /* list of variable arguments */
const char *msg; /* error message */
if ((code > 0) || (code < E_UNKNOWN))
code = E_UNKNOWN; /* check error code */
msg = errmsgs[-code]; /* get error message */
if (!msg) msg = errmsgs[-E_UNKNOWN];
fprintf(stderr, "\n%s: ", prgname);
va_start(args, code); /* get variable arguments */
vfprintf(stderr, msg, args); /* print error message */
va_end(args); /* end argument evaluation */
#ifndef NDEBUG
if (mvnorm) mvn_delete(mvnorm);
if (colset) cs_delete(colset);
if (tfscan) tfs_delete(tfscan); /* clean up memory */
if (symtab) st_delete(symtab); /* and close files */
if (in && (in != stdin)) fclose(in);
if (out && (out != stdout)) fclose(out);
#endif
#ifdef STORAGE
showmem("at end of program"); /* check memory usage */
#endif
exit(code); /* abort the program */
} /* error() */
/*--------------------------------------------------------------------*/
int main (int argc, char *argv[])
{ /* --- main function */
int i, k = 0; /* loop variables, counters */
char *s; /* to traverse options */
char **optarg = NULL; /* option argument */
char *fn_hdr = NULL; /* name of table header file */
char *fn_tab = NULL; /* name of table file */
char *fn_mat = NULL; /* name of matrix file */
char *fname = NULL; /* buffer for file name */
char *blanks = NULL; /* blank characters */
char *fldseps = NULL; /* field separators */
char *recseps = NULL; /* record separators */
char *uvchars = NULL; /* unknown value characters */
int header = 0; /* header type (table/default/file) */
int fldcnt = 0; /* number of fields/columns */
int wgtflg = 0; /* flag for weight in last field */
int flags = 0; /* flags for matrices to print */
int pos = 0; /* flag for positive values */
int tex = 0; /* flag for LaTeX output */
size_t maxlen = 6, len; /* (maximal) length of a value name */
int tplcnt = 0; /* number of tuples */
double tplwgt = 0.0; /* weight of tuples */
double weight = 1.0; /* weight of tuple */
int d; /* delimiter type */
int *p; /* pointer to symbol data */
prgname = argv[0]; /* get program name for error msgs. */
/* --- print startup/usage message --- */
if (argc > 1) { /* if arguments are given */
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
fprintf(stderr, VERSION); } /* print a startup message */
else { /* if no argument is given */
printf("usage: %s [options] "
"[-d|-h hdrfile] tabfile [matfile]\n", argv[0]);
printf("%s\n", DESCRIPTION);
printf("%s\n", VERSION);
printf("-x print expected values and standard deviation\n");
printf("-v print covariance matrix\n");
printf("-c print correlation coefficients (default)\n");
printf("-m use maximum likelihood estimates "
"for the (co)variances\n");
printf("-p use only positive values\n");
printf("-t print output in LaTeX format\n");
printf("-n number of tuple occurrences in last field\n");
printf("-b/f/r# blank characters, field and record separators\n"
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
printf("-u# unknown value characters (default: \"?\")\n");
printf("-d use default header "
"(field names = field numbers)\n");
printf("-h read table header (field names) from hdrfile\n");
printf("hdrfile file containing table header (field names)\n");
printf("tabfile table file to read "
"(field names in first record)\n");
printf("matfile file to write matrix to (optional)\n");
return 0; /* print a usage message */
} /* and abort the program */
/* --- evaluate arguments --- */
for (i = 1; i < argc; i++) { /* traverse arguments */
s = argv[i]; /* get option argument */
if (optarg) { *optarg = s; optarg = NULL; continue; }
if ((*s == '-') && *++s) { /* -- if argument is an option */
while (1) { /* traverse characters */
switch (*s++) { /* evaluate option */
case 'x': flags |= MVN_EXPVAR; break;
case 'v': flags |= MVN_COVAR; break;
case 'c': flags |= MVN_CORREL; break;
case 'm': flags |= MVN_MAXLLH; break;
case 'p': pos = 1; break;
case 't': tex = 1; break;
case 'n': wgtflg = 1; break;
case 'b': optarg = &blanks; break;
case 'f': optarg = &fldseps; break;
case 'r': optarg = &recseps; break;
case 'u': optarg = &uvchars; break;
case 'd': header = 1; break;
case 'h': optarg = &fn_hdr; break;
default : error(E_OPTION, *--s); break;
} /* set option variables */
if (!*s) break; /* if at end of string, abort loop */
if (optarg) { *optarg = s; optarg = NULL; break; }
} } /* get option argument */
else { /* -- if argument is no option */
switch (k++) { /* evaluate non-option */
case 0: fn_tab = s; break;
case 1: fn_mat = s; break;
default: error(E_ARGCNT); break;
} /* note filenames */
}
}
if (optarg) error(E_OPTARG); /* check option argument */
if ((k < 1) || (k > 2)) error(E_ARGCNT);
if (fn_hdr) { /* set the header file flag */
header = 2; if (strcmp(fn_hdr, "-") == 0) fn_hdr = ""; }
if (!flags) flags = MVN_CORREL;
/* --- create symbol table and table file scanner --- */
symtab = st_create(0, 0, (HASHFN*)0, (SYMFN*)0);
if (!symtab) error(E_NOMEM); /* create symbol table */
tfscan = tfs_create(); /* and table file scanner */
if (!tfscan) error(E_NOMEM); /* and set delimiter characters */
tfs_chars(tfscan, TFS_BLANK, blanks);
tfs_chars(tfscan, TFS_FLDSEP, fldseps);
tfs_chars(tfscan, TFS_RECSEP, recseps);
colset = cs_create(); /* create a column set */
if (!colset) error(E_NOMEM); /* for the table reading */
/* --- read table header/first record --- */
fname = (fn_hdr) ? fn_hdr : fn_tab;
if (!fname || !*fname) { /* if no proper file name is given, */
in = stdin; fname = ""; } /* read from standard input */
else { /* if a proper file name is given */
in = fopen(fname, "r"); /* open file for reading */
if (!in) error(E_FOPEN, fname);
}
fprintf(stderr, "\nreading %s ... ", fname);
do { /* read fields of table header */
d = tfs_getfld(tfscan, in, rdbuf, BUFSIZE-1);
if (d < 0) error(E_FREAD, fname);
fldcnt++; /* read the next field name */
if ((d <= TFS_REC) && wgtflg)
break; /* skip a tuple weight field */
if (header != 1) { /* if to use a normal header */
if (!rdbuf[0]) error(E_EMPFLD, fname, 1, rdbuf, fldcnt);
p = (int*)st_insert(symtab, rdbuf, -1, sizeof(int));
if (!p) error(E_NOMEM); /* insert name into symbol table */
if (p == EXISTS) error(E_DUPFLD, fname, 1, rdbuf, fldcnt);
*p = fldcnt-1; /* note the field number */
if (cs_add(colset, st_name(p)) != 0)
error(E_NOMEM); } /* add a column to the column set */
else { /* if to use a default header, */
sprintf(fnbuf, "%d", fldcnt); /* create a field name */
p = (int*)st_insert(symtab, fnbuf, -1, sizeof(int));
if (!p) error(E_NOMEM); /* insert name into symbol table */
*p = fldcnt-1; /* note the field number */
if (cs_add(colset, st_name(p)) != 0)
error(E_NOMEM); /* add a column to the column set */
cs_set(colset, fldcnt-1, isunk(tfscan, rdbuf) ? NULL : rdbuf);
} /* set the column value */
len = strlen(st_name(p)); /* determine the maximal length */
if (len > maxlen) maxlen = len; /* of the field names */
} while (d > TFS_REC); /* while not at end of record */
if (fn_hdr) { /* close the table header file */
fclose(in); fprintf(stderr, "done."); }
/* --- compute covariance matrix --- */
mvnorm = mvn_create(colset->colcnt);
if (!mvnorm) error(E_NOMEM); /* create a normal distribution */
if (header > 1) { /* if a table header file is given */
if (fn_tab && *fn_tab) /* if a proper table name is given, */
in = fopen(fn_tab, "r"); /* open table file for reading */
else { /* if no table file name is given, */
in = stdin; fn_tab = ""; } /* read from std. input */
fprintf(stderr, "reading %s ... ", fn_tab);
if (!in) error(E_FOPEN, fn_tab); }
else if (header > 0) { /* if to use a default header */
if (wgtflg) { /* if a tuple weight is given, */
weight = strtod(rdbuf,&s); /* get the tuple weight */
if (!*s || (s == rdbuf) || (weight < 0))
error(E_VALUE, fname, 1, rdbuf, fldcnt);
}
if (pos) cs_procx(colset, mvnorm, weight);
else cs_proc (colset, mvnorm, weight);
tplwgt += weight; /* aggregate for the first tuple, */
tplcnt++; /* sum the tuple weight and */
} /* increment the tuple counter */
do { /* read table records */
d = TFS_FLD; /* read fields in table record */
for (i = 0; (i < fldcnt) && (d >= TFS_FLD); i++) {
d = tfs_getfld(tfscan, in, rdbuf, BUFSIZE-1);
if (d < 0) error(E_FREAD, fn_tab);
if (d <= TFS_EOF) { /* if at end of file, */
i = -1; break; } /* abort the read loop */
if (i < fldcnt -wgtflg) /* if this is a normal field */
cs_set(colset, i, isunk(tfscan, rdbuf) ? NULL : rdbuf);
else { /* if this is the weight field */
weight = strtod(rdbuf, &s);
if (!*s || (s == rdbuf) || (weight < 0))
error(E_VALUE, fn_tab, tplcnt +((header > 0) ? 1 : 2), rdbuf);
} /* get the tuple weight/counter */
} /* and check for a valid number */
if (i < 0) break; /* if at end of file, abort loop */
if (i != fldcnt) /* check number of fields in record */
error(E_FLDCNT, fn_tab, tplcnt +((header > 0) ? 1 : 2), fldcnt);
if (pos) cs_procx(colset, mvnorm, weight);
else cs_proc (colset, mvnorm, weight);
tplwgt += weight; /* aggregate for the current tuple */
tplcnt++; /* and sum the weight/count the tuple */
} while (d > TFS_EOF); /* while not at end of file */
if (in != stdin) fclose(in); /* close the input file and */
in = NULL; /* clear the file variable */
fprintf(stderr, "[%d/%g tuple(s)] done.\n", tplcnt, tplwgt);
/* --- print matrix/matrices --- */
k = flags; /* build the evaluation flags */
if (k & MVN_CORREL) k |= MVN_COVAR;
if (k & MVN_COVAR) k |= MVN_EXPVAR;
mvn_calc(mvnorm, k); /* calculate parameters from data */
if (fn_mat) /* if a matrix file name is given, */
out = fopen(fn_mat, "w"); /* open correlation matrix file */
else { /* if no matrix file name is given, */
out = stdout; fn_mat = ""; } /* write to stdout */
fprintf(stderr, "writing %s ... ", fn_mat);
if (!out) error(E_FOPEN, fn_mat);
if (flags & MVN_EXPVAR) { /* print the expected values */
expvar(colset, mvnorm, tex, pos, out);
if (flags & (MVN_COVAR|MVN_CORREL)) fputc('\n', out);
} /* leave one line empty */
if (flags & MVN_COVAR) { /* print the covariance matrix */
covar(colset, mvnorm, tex, out);
if (flags & MVN_CORREL) fputc('\n', out);
} /* leave one line empty */
if (flags & MVN_CORREL) /* print the correlation coefficients */
correl(colset, mvnorm, tex, out);
if (out != stdout) { /* if not written to standard output */
i = fclose(out); out = NULL;
if (i) error(E_FWRITE, fn_mat);
} /* close correlation matrix file */
fprintf(stderr, "done.\n"); /* and print a success message */
/* --- clean up --- */
#ifndef NDEBUG /* if debug version */
mvn_delete(mvnorm); /* delete normal distribution, */
cs_delete(colset); /* column set, */
tfs_delete(tfscan); /* table file scanner, */
st_delete(symtab); /* and symbol table */
#endif
#ifdef STORAGE
showmem("at end of program"); /* check memory usage */
#endif
return 0; /* return 'ok' */
} /* main() */