www.pudn.com > FP-GROWTH.rar > fpgrowth.c
/*----------------------------------------------------------------------
File : fpgrowth.c
Contents: fpgrowth algorithm for finding frequent item sets
Author : Christian Borgelt
History : 21.11.2004 file created from eclat.c
22.11.2004 pruning of tree projection to bonsai added
23.11.2004 absolute/relative support output changed
09.12.2004 filter added (binary logarithm of supp. quotient)
20.06.2005 use of flag for "no item sorting" corrected
----------------------------------------------------------------------*/
#include
#include
#include
#include
#include
#include
#include
#include "scan.h"
#include "fptree.h"
#ifdef STORAGE
#include "storage.h"
#endif
/*----------------------------------------------------------------------
Preprocessor Definitions
----------------------------------------------------------------------*/
#define PRGNAME "fpgrowth"
#define DESCRIPTION "find frequent item sets " \
"with the fpgrowth algorithm"
#define VERSION "version 1.8 (2005.12.06) " \
"(c) 2004-2005 Christian Borgelt"
/* --- output flags --- */
#define OF_REL 0x01 /* print relative support */
#define OF_ABS 0x02 /* print absolute support */
#define OF_DEV 0x04 /* print deviation from indep. occ. */
#define OF_SCANF 0x08 /* convert names to scanable form */
/* --- error codes --- */
#define E_OPTION (-5) /* unknown option */
#define E_OPTARG (-6) /* missing option argument */
#define E_ARGCNT (-7) /* too few/many arguments */
#define E_STDIN (-8) /* double assignment of stdin */
#define E_SUPP (-9) /* invalid item set support */
#define E_ITEMCNT (-10) /* invalid number of items */
#define E_NOTAS (-11) /* no items or transactions */
#define E_UNKNOWN (-18) /* unknown error */
#ifndef QUIET /* if not quiet version */
#define MSG(x) x /* print messages */
#else /* if quiet version */
#define MSG(x) /* suppress messages */
#endif
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
#define RECCNT(s) (tfs_reccnt(is_tfscan(s)) \
+ ((tfs_delim(is_tfscan(s)) == TFS_REC) ? 0 : 1))
#define BUFFER(s) tfs_buf(is_tfscan(s))
/*----------------------------------------------------------------------
Constants
----------------------------------------------------------------------*/
#define LN_2 0.69314718055994530942 /* ln(2) */
#ifndef QUIET /* if not quiet version */
/* --- error messages --- */
static const char *errmsgs[] = {
/* E_NONE 0 */ "no error\n",
/* E_NOMEM -1 */ "not enough memory\n",
/* E_FOPEN -2 */ "cannot open file %s\n",
/* E_FREAD -3 */ "read error on file %s\n",
/* E_FWRITE -4 */ "write error on file %s\n",
/* E_OPTION -5 */ "unknown option -%c\n",
/* E_OPTARG -6 */ "missing option argument\n",
/* E_ARGCNT -7 */ "wrong number of arguments\n",
/* E_STDIN -8 */ "double assignment of standard input\n",
/* E_SUPP -9 */ "invalid minimal support %g%%\n",
/* E_ITEMCNT -10 */ "invalid number of items %d\n",
/* E_NOTAS -11 */ "no items or transactions to work on\n",
/* -12 to -15 */ NULL, NULL, NULL, NULL,
/* E_ITEMEXP -16 */ "file %s, record %d: item expected\n",
/* E_DUPITEM -17 */ "file %s, record %d: duplicate item %s\n",
/* E_UNKNOWN -18 */ "unknown error\n"
};
#endif
/*----------------------------------------------------------------------
Global Variables
----------------------------------------------------------------------*/
#ifndef QUIET
static char *prgname; /* program name for error messages */
#endif
static ITEMSET *itemset = NULL; /* item set */
static TASET *taset = NULL; /* transaction set */
static FPTREE *fptree = NULL; /* frequent pattern tree */
static FILE *in = NULL; /* input file */
static FILE *out = NULL; /* output file */
static int tacnt = 0; /* number of transactions */
static double mindev = -DBL_MAX; /* minimal value of deviation */
static double *logfs = NULL; /* logarithms of item frequencies */
static int flags = OF_REL; /* output flags */
static char *fmt = "%.1f"; /* output format for support */
static char buf[4*TFS_SIZE+4]; /* buffer for formatting */
/*----------------------------------------------------------------------
Item Set Report Function
----------------------------------------------------------------------*/
static int _report (int *ids, int cnt, int supp, void *data)
{ /* --- report a frequent item set */
int i; /* loop variable */
const char *name; /* buffer for item name */
double dev = 0; /* deviation from indep. occurrence */
assert(ids); /* check the function arguments */
if (flags & OF_DEV) { /* if to filter w.r.t. deviation */
for (i = cnt; --i >= 0; ) /* traverse the items and */
dev += logfs[ids[i]]; /* sum the logs of the item freqs. */
dev = (log(supp/(double)tacnt) -dev) /LN_2;
if (dev < mindev) return 0; /* compute and check the deviation */
} /* from an independent occurrence */
for (i = 0; i < cnt; i++) { /* traverse the item set */
name = is_name(itemset, ids[i]);
if (flags & OF_SCANF) { /* convert to scanable form */
sc_format(buf, name, 0); name = buf; }
fputs(name, out); fputc(' ', out);
} /* print the item names */
fputs(" (", out); /* print rel. and abs. support */
if (flags & OF_REL) { fprintf(out, fmt, (supp/(double)tacnt) *100);
fputs((flags & OF_ABS) ? "%/" : "%", out); }
if (flags & OF_ABS) { fprintf(out, "%d", supp); }
if (flags & OF_DEV) { fputs(", ", out); fprintf(out, fmt, dev); }
fputs(")\n", out); /* terminate the support output */
return 1; /* return 'reported 1 item set' */
} /* _report() */
/*----------------------------------------------------------------------
Main Functions
----------------------------------------------------------------------*/
static void error (int code, ...)
{ /* --- print an error message */
#ifndef QUIET /* if not quiet version */
va_list args; /* list of variable arguments */
const char *msg; /* error message */
assert(prgname); /* check the program name */
if (code < E_UNKNOWN) code = E_UNKNOWN;
if (code < 0) { /* if to report an error, */
msg = errmsgs[-code]; /* get the error message */
if (!msg) msg = errmsgs[-E_UNKNOWN];
fprintf(stderr, "\n%s: ", prgname);
va_start(args, code); /* get variable arguments */
vfprintf(stderr, msg, args);/* print error message */
va_end(args); /* end argument evaluation */
}
#endif
#ifndef NDEBUG /* if debug version */
if (fptree) fpt_delete(fptree); /* clean up memory */
if (taset) tas_delete(taset, 0); /* and close files */
if (itemset) is_delete(itemset);
if (in && (in != stdin)) fclose(in);
if (out && (out != stdout)) fclose(out);
#endif
#ifdef STORAGE /* if storage debugging */
showmem("at end of program"); /* check memory usage */
#endif
exit(code); /* abort the program */
} /* error() */
/*--------------------------------------------------------------------*/
int main (int argc, char *argv[])
{ /* --- main function */
int i, k = 0, n; /* loop variables, counters */
char *s; /* to traverse the options */
char **optarg = NULL; /* option argument */
char *fn_in = NULL; /* name of input file */
char *fn_out = NULL; /* name of output file */
char *blanks = NULL; /* blanks */
char *fldseps = NULL; /* field separators */
char *recseps = NULL; /* record separators */
char *cominds = NULL; /* comment indicators */
double supp = 0.1; /* minimal support (in percent) */
int min = 1; /* minimal size of item set */
int max = 5; /* maximal size of item set */
int sort = -2; /* flag for item sorting and recoding */
int mode = FPT_BONSAI;/* tree projection mode */
int heap = 1; /* flag for heap sort vs. quick sort */
int *map; /* identifier map for recoding */
clock_t t; /* timer for measurements */
#ifndef QUIET /* if not quiet version */
prgname = argv[0]; /* get program name for error msgs. */
/* --- print usage message --- */
if (argc > 1) { /* if arguments are given */
fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
fprintf(stderr, VERSION); } /* print a startup message */
else { /* if no arguments given */
printf("usage: %s [options] infile outfile\n", argv[0]);
printf("%s\n", DESCRIPTION);
printf("%s\n", VERSION);
printf("-m# minimal number of items per item set "
"(default: %d)\n", min);
printf("-n# maximal number of items per item set "
"(default: %d)\n", max);
printf("-s# minimal support of an item set "
"(default: %g%%)\n", supp *100);
printf(" (positive: percentage, "
"negative: absolut number)\n");
printf("-d# minimal binary logarithm of support quotient "
"(default: none)\n");
printf("-p# output format for the item set support "
"(default: \"%s\")\n", fmt);
printf("-a print absolute support "
"(number of transactions)\n");
printf("-g write output in scanable form "
"(quote certain characters)\n");
printf("-q# sort items w.r.t. their frequency (default: %d)\n"
" (1: ascending, -1: descending, 0: do not sort,\n"
" 2: ascending, -2: descending w.r.t. "
"transaction size sum)\n", sort);
printf("-u use alternative tree projection method\n");
printf("-z do not prune tree projections to bonsai\n");
printf("-j use quicksort to sort the transactions "
"(default: heapsort)\n");
printf("-i# ignore records starting with a character "
"in the given string\n");
printf("-b/f/r# blank characters, field and record separators\n"
" (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
printf("infile file to read transactions from\n");
printf("outfile file to write frequent item sets to\n");
return 0; /* print a usage message */
} /* and abort the program */
#endif /* #ifndef QUIET */
/* --- evaluate arguments --- */
for (i = 1; i < argc; i++) { /* traverse arguments */
s = argv[i]; /* get option argument */
if (optarg) { *optarg = s; optarg = NULL; continue; }
if ((*s == '-') && *++s) { /* -- if argument is an option */
while (*s) { /* traverse options */
switch (*s++) { /* evaluate switches */
case 'm': min = (int)strtol(s, &s, 0); break;
case 'n': max = (int)strtol(s, &s, 0); break;
case 's': supp = 0.01*strtod(s, &s); break;
case 'd': mindev = strtod(s, &s); break;
case 'p': optarg = &fmt; break;
case 'a': flags |= OF_ABS; break;
case 'g': flags |= OF_SCANF; break;
case 'q': sort = (int)strtol(s, &s, 0); break;
case 'u': mode |= FPT_ALTPROJ; break;
case 'z': mode &= ~FPT_BONSAI; break;
case 'j': heap = 0; break;
case 'i': optarg = &cominds; break;
case 'b': optarg = &blanks; break;
case 'f': optarg = &fldseps; break;
case 'r': optarg = &recseps; break;
default : error(E_OPTION, *--s); break;
} /* set option variables */
if (optarg && *s) { *optarg = s; optarg = NULL; break; }
} } /* get option argument */
else { /* -- if argument is no option */
switch (k++) { /* evaluate non-options */
case 0: fn_in = s; break;
case 1: fn_out = s; break;
default: error(E_ARGCNT); break;
} /* note filenames */
}
}
if (optarg) error(E_OPTARG); /* check option argument */
if (k != 2) error(E_ARGCNT); /* and the number of arguments */
if (supp > 1) /* check the minimal support */
error(E_SUPP, supp); /* (< 0: absolute number) */
if (min <= 0) error(E_ITEMCNT, min); /* check the limits */
if (max <= 0) error(E_ITEMCNT, max); /* for the set size */
if (mindev > -DBL_MAX) flags |= OF_DEV;
/* --- create item set and transaction set --- */
itemset = is_create(); /* create an item set and */
if (!itemset) error(E_NOMEM); /* set the special characters */
is_chars(itemset, blanks, fldseps, recseps, cominds);
taset = tas_create(itemset); /* create a transaction set */
if (!taset) error(E_NOMEM); /* to store the transactions */
MSG(fprintf(stderr, "\n")); /* terminate the startup message */
/* --- read transactions --- */
t = clock(); /* start the timer */
if (fn_in && *fn_in) /* if an input file name is given, */
in = fopen(fn_in, "r"); /* open input file for reading */
else { /* if no input file name is given, */
in = stdin; fn_in = ""; } /* read from standard input */
MSG(fprintf(stderr, "reading %s ... ", fn_in));
if (!in) error(E_FOPEN, fn_in);
for (tacnt = 0; 1; tacnt++) { /* transaction read loop */
k = is_read(itemset, in); /* read the next transaction */
if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
if (k > 0) break; /* check for error and end of file */
if (tas_add(taset, NULL, 0) != 0)
error(E_NOMEM); /* add the loaded transaction */
} /* to the transaction set */
if (in != stdin) fclose(in); /* if not read from standard input, */
in = NULL; /* close the input file */
n = is_cnt(itemset); /* get the number of items */
MSG(fprintf(stderr, "[%d item(s),", n));
MSG(fprintf(stderr, " %d transaction(s)] done ", tacnt));
MSG(fprintf(stderr, "[%.2fs].", SEC_SINCE(t)));
if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
MSG(fprintf(stderr, "\n")); /* check for at least one transaction */
if (supp < 0) { /* if absolute support is given */
if (!(flags & OF_ABS)) flags = (flags & ~OF_REL) | OF_ABS;
supp = (-100 *supp -0.5) /tacnt;
if (supp < 0) supp = 0; /* compute a proper relative support */
} /* and check it against 0 */
/* --- sort and recode items --- */
supp = ceil(tacnt *supp); /* sort and recode the items */
MSG(fprintf(stderr, "sorting and recoding items ... "));
t = clock(); /* start the timer */
map = (int*)malloc(is_cnt(itemset) *sizeof(int));
if (!map) error(E_NOMEM); /* create an item identifier map */
n = is_recode(itemset, (int)supp, sort, map);
is_trunc(itemset, n); /* truncate the itemset and */
tas_recode(taset, map, n); /* recode the loaded transactions */
free(map); /* delete the item identifier map */
MSG(fprintf(stderr, "[%d item(s)] ", n));
MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
if (n <= 0) error(E_NOTAS); /* print a log message and */
MSG(fprintf(stderr, "\n")); /* check the number of items */
if (flags & OF_DEV) { /* if to compute deviation */
logfs = (double*)malloc(n *sizeof(double));
if (!logfs) error(E_NOMEM); /* create a buffer for the logarithms */
for (i = n; --i >= 0; ) { /* traverse the (frequent) items */
k = is_getfrq(itemset, i);
logfs[i] = (k > 0) ? log(k/(double)tacnt) : 0;
} /* precompute the logarithms */
} /* of the item frequencies */
/* --- create a frequent pattern tree --- */
MSG(fprintf(stderr, "creating frequent pattern tree ... "));
t = clock(); /* start the timer */
tas_sort(taset, heap); /* sort the transactions */
fptree = fpt_create(taset); /* create a frequent pattern tree */
if (!fptree) error(E_NOMEM); /* to represent the transactions */
tas_delete(taset, 0); /* delete the transaction set */
taset = NULL; /* and print a success message */
MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));
/* --- find frequent item sets --- */
t = clock(); /* start the timer */
if (fn_out && *fn_out) /* if an output file name is given, */
out = fopen(fn_out, "w"); /* open the output file */
else { /* if no output file name is given, */
out = stdout; fn_out = ""; } /* write to std. output */
MSG(fprintf(stderr, "writing %s ... ", fn_out));
if (!out) error(E_FOPEN, fn_out);
k = fpt_search(fptree, (int)supp, min, max, mode, _report, NULL);
if (k < 0) error(E_NOMEM); /* search for frequent item sets */
if (fflush(out) != 0) error(E_FWRITE, fn_out);
if (out != stdout) fclose(out);
out = NULL; /* close the output file */
MSG(fprintf(stderr, "[%d set(s)] done ", k));
MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));
/* --- clean up --- */
#ifndef NDEBUG /* if this is a debug version */
if (logfs) free(logfs); /* delete the log freqs. buffer */
fpt_delete(fptree); /* delete the freq. pattern tree */
is_delete(itemset); /* and the item set */
#endif
#ifdef STORAGE /* if storage debugging */
showmem("at end of program"); /* check memory usage */
#endif
return 0; /* return 'ok' */
} /* main() */