www.pudn.com > FP-GROWTH.rar > fpgrowth.c


/*----------------------------------------------------------------------
  File    : fpgrowth.c
  Contents: fpgrowth algorithm for finding frequent item sets
  Author  : Christian Borgelt
  History : 21.11.2004 file created from eclat.c
            22.11.2004 pruning of tree projection to bonsai added
            23.11.2004 absolute/relative support output changed
            09.12.2004 filter added (binary logarithm of supp. quotient)
            20.06.2005 use of flag for "no item sorting" corrected
----------------------------------------------------------------------*/
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include "scan.h"
#include "fptree.h"
#ifdef STORAGE
#include "storage.h"
#endif

/*----------------------------------------------------------------------
  Preprocessor Definitions
----------------------------------------------------------------------*/
#define PRGNAME     "fpgrowth"
#define DESCRIPTION "find frequent item sets " \
                    "with the fpgrowth algorithm"
#define VERSION     "version 1.8 (2005.12.06)         " \
                    "(c) 2004-2005   Christian Borgelt"

/* --- output flags --- */
#define OF_REL      0x01        /* print relative support */
#define OF_ABS      0x02        /* print absolute support */
#define OF_DEV      0x04        /* print deviation from indep. occ. */
#define OF_SCANF    0x08        /* convert names to scanable form */

/* --- error codes --- */
#define E_OPTION    (-5)        /* unknown option */
#define E_OPTARG    (-6)        /* missing option argument */
#define E_ARGCNT    (-7)        /* too few/many arguments */
#define E_STDIN     (-8)        /* double assignment of stdin */
#define E_SUPP      (-9)        /* invalid item set support */
#define E_ITEMCNT  (-10)        /* invalid number of items */
#define E_NOTAS    (-11)        /* no items or transactions */
#define E_UNKNOWN  (-18)        /* unknown error */

#ifndef QUIET                   /* if not quiet version */
#define MSG(x)        x         /* print messages */
#else                           /* if quiet version */
#define MSG(x)                  /* suppress messages */
#endif

#define SEC_SINCE(t)  ((clock()-(t)) /(double)CLOCKS_PER_SEC)
#define RECCNT(s)     (tfs_reccnt(is_tfscan(s)) \
                      + ((tfs_delim(is_tfscan(s)) == TFS_REC) ? 0 : 1))
#define BUFFER(s)     tfs_buf(is_tfscan(s))

/*----------------------------------------------------------------------
  Constants
----------------------------------------------------------------------*/
#define LN_2       0.69314718055994530942   /* ln(2) */

#ifndef QUIET                   /* if not quiet version */
/* --- error messages --- */
static const char *errmsgs[] = {
  /* E_NONE      0 */  "no error\n",
  /* E_NOMEM    -1 */  "not enough memory\n",
  /* E_FOPEN    -2 */  "cannot open file %s\n",
  /* E_FREAD    -3 */  "read error on file %s\n",
  /* E_FWRITE   -4 */  "write error on file %s\n",
  /* E_OPTION   -5 */  "unknown option -%c\n",
  /* E_OPTARG   -6 */  "missing option argument\n",
  /* E_ARGCNT   -7 */  "wrong number of arguments\n",
  /* E_STDIN    -8 */  "double assignment of standard input\n",
  /* E_SUPP     -9 */  "invalid minimal support %g%%\n",
  /* E_ITEMCNT -10 */  "invalid number of items %d\n",
  /* E_NOTAS   -11 */  "no items or transactions to work on\n",
  /*    -12 to -15 */  NULL, NULL, NULL, NULL,
  /* E_ITEMEXP -16 */  "file %s, record %d: item expected\n",
  /* E_DUPITEM -17 */  "file %s, record %d: duplicate item %s\n",
  /* E_UNKNOWN -18 */  "unknown error\n"
};
#endif

/*----------------------------------------------------------------------
  Global Variables
----------------------------------------------------------------------*/
#ifndef QUIET
static char    *prgname;        /* program name for error messages */
#endif
static ITEMSET *itemset = NULL; /* item set */
static TASET   *taset   = NULL; /* transaction set */
static FPTREE  *fptree  = NULL; /* frequent pattern tree */
static FILE    *in      = NULL; /* input  file */
static FILE    *out     = NULL; /* output file */
static int     tacnt    = 0;    /* number of transactions */
static double  mindev   = -DBL_MAX; /* minimal value of deviation */
static double  *logfs   = NULL;     /* logarithms of item frequencies */
static int     flags    = OF_REL;   /* output flags */
static char    *fmt     = "%.1f";   /* output format for support */
static char    buf[4*TFS_SIZE+4];   /* buffer for formatting */

/*----------------------------------------------------------------------
  Item Set Report Function
----------------------------------------------------------------------*/

static int _report (int *ids, int cnt, int supp, void *data)
{                               /* --- report a frequent item set */
  int        i;                 /* loop variable */
  const char *name;             /* buffer for item name */
  double     dev = 0;           /* deviation from indep. occurrence */

  assert(ids);                  /* check the function arguments */
  if (flags & OF_DEV) {         /* if to filter w.r.t. deviation */
    for (i = cnt; --i >= 0; )   /* traverse the items and */
      dev += logfs[ids[i]];     /* sum the logs of the item freqs. */
    dev = (log(supp/(double)tacnt) -dev) /LN_2;
    if (dev < mindev) return 0; /* compute and check the deviation */
  }                             /* from an independent occurrence */
  for (i = 0; i < cnt; i++) {   /* traverse the item set */
    name = is_name(itemset, ids[i]);
    if (flags & OF_SCANF) {     /* convert to scanable form */
      sc_format(buf, name, 0); name = buf; }
    fputs(name, out); fputc(' ', out);
  }                             /* print the item names */
  fputs(" (", out);             /* print rel. and abs. support */
  if (flags & OF_REL) { fprintf(out, fmt, (supp/(double)tacnt) *100);
                        fputs((flags & OF_ABS) ? "%/" : "%", out); }
  if (flags & OF_ABS) { fprintf(out, "%d", supp); }
  if (flags & OF_DEV) { fputs(", ", out); fprintf(out, fmt, dev); }
  fputs(")\n", out);            /* terminate the support output */
  return 1;                     /* return 'reported 1 item set' */
}  /* _report() */

/*----------------------------------------------------------------------
  Main Functions
----------------------------------------------------------------------*/

static void error (int code, ...)
{                               /* --- print an error message */
  #ifndef QUIET                 /* if not quiet version */
  va_list    args;              /* list of variable arguments */
  const char *msg;              /* error message */

  assert(prgname);              /* check the program name */
  if (code < E_UNKNOWN) code = E_UNKNOWN;
  if (code < 0) {               /* if to report an error, */
    msg = errmsgs[-code];       /* get the error message */
    if (!msg) msg = errmsgs[-E_UNKNOWN];
    fprintf(stderr, "\n%s: ", prgname);
    va_start(args, code);       /* get variable arguments */
    vfprintf(stderr, msg, args);/* print error message */
    va_end(args);               /* end argument evaluation */
  }
  #endif
  #ifndef NDEBUG                /* if debug version */
  if (fptree)  fpt_delete(fptree);    /* clean up memory */
  if (taset)   tas_delete(taset, 0);  /* and close files */
  if (itemset) is_delete(itemset);
  if (in  && (in  != stdin))  fclose(in);
  if (out && (out != stdout)) fclose(out);
  #endif
  #ifdef STORAGE                /* if storage debugging */
  showmem("at end of program"); /* check memory usage */
  #endif
  exit(code);                   /* abort the program */
}  /* error() */

/*--------------------------------------------------------------------*/

int main (int argc, char *argv[])
{                               /* --- main function */
  int     i, k = 0, n;          /* loop variables, counters */
  char    *s;                   /* to traverse the options */
  char    **optarg = NULL;      /* option argument */
  char    *fn_in   = NULL;      /* name of input  file */
  char    *fn_out  = NULL;      /* name of output file */
  char    *blanks  = NULL;      /* blanks */
  char    *fldseps = NULL;      /* field  separators */
  char    *recseps = NULL;      /* record separators */
  char    *cominds = NULL;      /* comment indicators */
  double  supp     = 0.1;       /* minimal support (in percent) */
  int     min      =  1;        /* minimal size of item set */
  int     max      =  5;        /* maximal size of item set */
  int     sort     = -2;        /* flag for item sorting and recoding */
  int     mode     = FPT_BONSAI;/* tree projection mode */
  int     heap     =  1;        /* flag for heap sort vs. quick sort */
  int     *map;                 /* identifier map for recoding */
  clock_t t;                    /* timer for measurements */

  #ifndef QUIET                 /* if not quiet version */
  prgname = argv[0];            /* get program name for error msgs. */

  /* --- print usage message --- */
  if (argc > 1) {               /* if arguments are given */
    fprintf(stderr, "%s - %s\n", argv[0], DESCRIPTION);
    fprintf(stderr, VERSION); } /* print a startup message */
  else {                        /* if no arguments given */
    printf("usage: %s [options] infile outfile\n", argv[0]);
    printf("%s\n", DESCRIPTION);
    printf("%s\n", VERSION);
    printf("-m#      minimal number of items per item set "
                    "(default: %d)\n", min);
    printf("-n#      maximal number of items per item set "
                    "(default: %d)\n", max);
    printf("-s#      minimal support of an item set "
                    "(default: %g%%)\n", supp *100);
    printf("         (positive: percentage, "
                    "negative: absolut number)\n");
    printf("-d#      minimal binary logarithm of support quotient "
                    "(default: none)\n");
    printf("-p#      output format for the item set support "
                    "(default: \"%s\")\n", fmt);
    printf("-a       print absolute support "
                    "(number of transactions)\n");
    printf("-g       write output in scanable form "
                    "(quote certain characters)\n");
    printf("-q#      sort items w.r.t. their frequency (default: %d)\n"
           "         (1: ascending, -1: descending, 0: do not sort,\n"
           "          2: ascending, -2: descending w.r.t. "
                    "transaction size sum)\n", sort);
    printf("-u       use alternative tree projection method\n");
    printf("-z       do not prune tree projections to bonsai\n");
    printf("-j       use quicksort to sort the transactions "
                    "(default: heapsort)\n");
    printf("-i#      ignore records starting with a character "
                    "in the given string\n");
    printf("-b/f/r#  blank characters, field and record separators\n"
           "         (default: \" \\t\\r\", \" \\t\", \"\\n\")\n");
    printf("infile   file to read transactions from\n");
    printf("outfile  file to write frequent item sets to\n");
    return 0;                   /* print a usage message */
  }                             /* and abort the program */
  #endif  /* #ifndef QUIET */

  /* --- evaluate arguments --- */
  for (i = 1; i < argc; i++) {  /* traverse arguments */
    s = argv[i];                /* get option argument */
    if (optarg) { *optarg = s; optarg = NULL; continue; }
    if ((*s == '-') && *++s) {  /* -- if argument is an option */
      while (*s) {              /* traverse options */
        switch (*s++) {         /* evaluate switches */
          case 'm': min    = (int)strtol(s, &s, 0); break;
          case 'n': max    = (int)strtol(s, &s, 0); break;
          case 's': supp   = 0.01*strtod(s, &s);    break;
          case 'd': mindev =      strtod(s, &s);    break;
          case 'p': optarg = &fmt;                  break;
          case 'a': flags |= OF_ABS;                break;
          case 'g': flags |= OF_SCANF;              break;
          case 'q': sort   = (int)strtol(s, &s, 0); break;
          case 'u': mode  |=  FPT_ALTPROJ;          break;
          case 'z': mode  &= ~FPT_BONSAI;           break;
          case 'j': heap   = 0;                     break;
          case 'i': optarg = &cominds;              break;
          case 'b': optarg = &blanks;               break;
          case 'f': optarg = &fldseps;              break;
          case 'r': optarg = &recseps;              break;
          default : error(E_OPTION, *--s);          break;
        }                       /* set option variables */
        if (optarg && *s) { *optarg = s; optarg = NULL; break; }
      } }                       /* get option argument */
    else {                      /* -- if argument is no option */
      switch (k++) {            /* evaluate non-options */
        case  0: fn_in  = s;      break;
        case  1: fn_out = s;      break;
        default: error(E_ARGCNT); break;
      }                         /* note filenames */
    }
  }
  if (optarg) error(E_OPTARG);  /* check option argument */
  if (k != 2) error(E_ARGCNT);  /* and the number of arguments */
  if (supp > 1)                 /* check the minimal support */
    error(E_SUPP, supp);        /* (< 0: absolute number) */
  if (min <= 0) error(E_ITEMCNT, min);   /* check the limits */
  if (max <= 0) error(E_ITEMCNT, max);   /* for the set size */
  if (mindev > -DBL_MAX) flags |= OF_DEV;

  /* --- create item set and transaction set --- */
  itemset = is_create();        /* create an item set and */
  if (!itemset) error(E_NOMEM); /* set the special characters */
  is_chars(itemset, blanks, fldseps, recseps, cominds);
  taset = tas_create(itemset);  /* create a transaction set */
  if (!taset) error(E_NOMEM);   /* to store the transactions */
  MSG(fprintf(stderr, "\n"));   /* terminate the startup message */

  /* --- read transactions --- */
  t = clock();                  /* start the timer */
  if (fn_in && *fn_in)          /* if an input file name is given, */
    in = fopen(fn_in, "r");     /* open input file for reading */
  else {                        /* if no input file name is given, */
    in = stdin; fn_in = ""; }   /* read from standard input */
  MSG(fprintf(stderr, "reading %s ... ", fn_in));
  if (!in) error(E_FOPEN, fn_in);
  for (tacnt = 0; 1; tacnt++) { /* transaction read loop */
    k = is_read(itemset, in);   /* read the next transaction */
    if (k < 0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset));
    if (k > 0) break;           /* check for error and end of file */
    if (tas_add(taset, NULL, 0) != 0)
      error(E_NOMEM);           /* add the loaded transaction */
  }                             /* to the transaction set */
  if (in != stdin) fclose(in);  /* if not read from standard input, */
  in = NULL;                    /* close the input file */
  n  = is_cnt(itemset);         /* get the number of items */
  MSG(fprintf(stderr, "[%d item(s),", n));
  MSG(fprintf(stderr, " %d transaction(s)] done ", tacnt));
  MSG(fprintf(stderr, "[%.2fs].", SEC_SINCE(t)));
  if ((n <= 0) || (tacnt <= 0)) error(E_NOTAS);
  MSG(fprintf(stderr, "\n"));   /* check for at least one transaction */
  if (supp < 0) {               /* if absolute support is given */
    if (!(flags & OF_ABS)) flags = (flags & ~OF_REL) | OF_ABS;
    supp = (-100 *supp -0.5) /tacnt;
    if (supp < 0) supp = 0;     /* compute a proper relative support */
  }                             /* and check it against 0 */

  /* --- sort and recode items --- */
  supp = ceil(tacnt *supp);     /* sort and recode the items */
  MSG(fprintf(stderr, "sorting and recoding items ... "));
  t   = clock();                /* start the timer */
  map = (int*)malloc(is_cnt(itemset) *sizeof(int));
  if (!map) error(E_NOMEM);     /* create an item identifier map */
  n = is_recode(itemset, (int)supp, sort, map);
  is_trunc(itemset, n);         /* truncate the itemset and */
  tas_recode(taset, map, n);    /* recode the loaded transactions */
  free(map);                    /* delete the item identifier map */
  MSG(fprintf(stderr, "[%d item(s)] ", n));
  MSG(fprintf(stderr, "done [%.2fs].", SEC_SINCE(t)));
  if (n <= 0) error(E_NOTAS);   /* print a log message and */
  MSG(fprintf(stderr, "\n"));   /* check the number of items */
  if (flags & OF_DEV) {         /* if to compute deviation */
    logfs = (double*)malloc(n *sizeof(double));
    if (!logfs) error(E_NOMEM); /* create a buffer for the logarithms */
    for (i = n; --i >= 0; ) {   /* traverse the (frequent) items */
      k = is_getfrq(itemset, i);
      logfs[i] = (k > 0) ? log(k/(double)tacnt) : 0;
    }                           /* precompute the logarithms */
  }                             /* of the item frequencies */

  /* --- create a frequent pattern tree --- */
  MSG(fprintf(stderr, "creating frequent pattern tree ... "));
  t = clock();                  /* start the timer */
  tas_sort(taset, heap);        /* sort the transactions */
  fptree = fpt_create(taset);   /* create a frequent pattern tree */
  if (!fptree) error(E_NOMEM);  /* to represent the transactions */
  tas_delete(taset, 0);         /* delete the transaction set */
  taset = NULL;                 /* and print a success message */
  MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)));

  /* --- find frequent item sets --- */
  t = clock();                  /* start the timer */
  if (fn_out && *fn_out)        /* if an output file name is given, */
    out = fopen(fn_out, "w");   /* open the output file */
  else {                        /* if no output file name is given, */
    out = stdout; fn_out = ""; }    /* write to std. output */
  MSG(fprintf(stderr, "writing %s ... ", fn_out));
  if (!out) error(E_FOPEN, fn_out);
  k = fpt_search(fptree, (int)supp, min, max, mode, _report, NULL);
  if (k < 0) error(E_NOMEM);    /* search for frequent item sets */
  if (fflush(out) != 0) error(E_FWRITE, fn_out);
  if (out != stdout) fclose(out);
  out = NULL;                   /* close the output file */
  MSG(fprintf(stderr, "[%d set(s)] done ", k));
  MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)));

  /* --- clean up --- */
  #ifndef NDEBUG                /* if this is a debug version */
  if (logfs) free(logfs);       /* delete the log freqs. buffer */
  fpt_delete(fptree);           /* delete the freq. pattern tree */
  is_delete(itemset);           /* and the item set */
  #endif
  #ifdef STORAGE                /* if storage debugging */
  showmem("at end of program"); /* check memory usage */
  #endif
  return 0;                     /* return 'ok' */
}  /* main() */