www.pudn.com > cppcc.rar > basic_dfa_generator.cc


/*
 *  File:       basic_dfa_generator.cc
 *              $Id: basic_dfa_generator.cc,v 1.7 2002/05/27 02:58:12 alec Exp $
 *
 *  Author:     Alec Panoviciu (alecu@email.com)
 * 
 *  Comments:
 *
 *  Revision history:
 *
 *  $Log: basic_dfa_generator.cc,v $
 *  Revision 1.7  2002/05/27 02:58:12  alec
 *  doc update
 *
 *  Revision 1.6  2002/05/22 01:31:42  alec
 *  cleaned up firstpo/followpos/nullable computation
 *
 *  Revision 1.5  2002/05/07 10:02:18  alec
 *  fixed some bugs & mem leaks; added MORE tokens support
 *
 *  Revision 1.4  2002/05/04 17:39:22  alec
 *  the scanner works (slightly tested)
 *
 *  Revision 1.3  2002/05/01 16:32:14  alec
 *  dfa ok. huh !
 *
 *  Revision 1.2  2002/05/01 09:18:26  alec
 *  - vBitset fixes
 *  - FOLLOWPOS written (not tested yet)
 *
 *  Revision 1.1  2002/04/30 16:26:38  alec
 *  my big endian will eat your little endian
 *
 */

/*
  Copyright (C) 2002 Alexandru Panoviciu (alecu@email.com)

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 */

#include 
#include 

#include "prop_registry.hh"
#include "scanner_spec.hh"
#include "basic_dfa_generator.hh"
#include "basic_dfa_spec.hh"
#include "dfa_source_re.hh"
#include "re_node.hh"
#include "re_node_algo.hh"
#include "itoken_spec.hh"

BasicDfaGenerator::BasicDfaGenerator (ITokenSpec &tokens_,
                                      PropRegistry ®istry_) :
  tokens(tokens_), registry(registry_)
{}


/**
 * Implements the regexp leaf nodes numbering algorithm.
 */
class LeafNumberGen : public ReNodeAlgo
{
public:

  LeafNumberGen () : counter(0) {}

  /**
   * Assigns each leaf node with unique consecutive ids.
   */
  virtual bool operator () (ReNode &n)
  {
    ASSERT(&n != NULL, "Regexp tree corrupted !");
    
    if (typeid(n) == typeid(ReCharNode)) {
      ReCharNode &nn = dynamic_cast(n);
      nn.pos = counter++;
    } else if (typeid(n) == typeid(ReEotNode)) {
      ReEotNode &nn = dynamic_cast(n);
      nn.pos = counter++;
    } else if (typeid(n) == typeid(ReLambdaNode)) {
      ReLambdaNode &nn = dynamic_cast(n);
      nn.pos = counter++;
    }
      
    return true;
  }
  
  unsigned int counter;
};


/**
 * Implements the FOLLOWPOS table generation algorithm.
 */
class FollowPosGen : public ReNodeAlgo
{
public:
  
  FollowPosGen (vBitset *table_) : table(table_) {}

  virtual bool operator () (ReNode &n)
  {
    if (typeid(n) == typeid(ReCatNode)) {
      ReCatNode &nn = dynamic_cast(n);
      vBitset &preLast = nn.pre->lastPos;
      vBitset &postFirst = nn.post->firstPos;
      vBitset::reference ri = preLast.begin();
      for (unsigned int i = 0; ri.neq(preLast.end()); ri.next(), i++)
        if (ri.get())
          table[i] |= postFirst;
    } else if (typeid(n) == typeid(ReStarNode)) {
      ReStarNode &nn = dynamic_cast(n);
      vBitset &nnLast = nn.lastPos;
      vBitset &nnFirst = nn.firstPos;
      vBitset::reference ri = nnLast.begin();
      for (unsigned int i = 0; ri.neq(nnLast.end()); ri.next(), i++)
        if (ri.get())
          table[i] |= nnFirst;
    }

    return true;
  }
  
  vBitset *table;
};


/**
 * Stores a pointers to each leaf node in a regexp tree into a given array of
 * pointers. The position in the array where the opinters are sored is given
 * by the leaf nodes' position fields.
 */
class LeafFinder : public ReNodeAlgo
{
public:
  
  LeafFinder (DfaSourceRe **leaves_) : leaves(leaves_)
  {}

  virtual bool operator () (ReNode &n)
  {
    ASSERT(&n != NULL, "Regexp tree corrupted !");
    
    if (typeid(n) == typeid(ReCharNode)) {
      ReCharNode &nn = dynamic_cast(n);
      leaves[nn.pos] = &nn;
    } else if (typeid(n) == typeid(ReEotNode)) {
      ReEotNode &nn = dynamic_cast(n);
      leaves[nn.pos] = &nn;
    } else if (typeid(n) == typeid(ReLambdaNode)) {
      ReLambdaNode &nn = dynamic_cast(n);
      leaves[nn.pos] = &nn;
    }
    return true;
  }

  
  DfaSourceRe **leaves;
};


/**
 * Computes the firstpos and lastpos sets for each node. The bitvectors that
 * represents the sets will have a 1 at position i if the leaf node with
 * position i belongs to the set.
 */
class AttributeGen : public ReNodeAlgo
{
public:

  virtual bool operator () (ReNode &n)
  {
    if (typeid(n) == typeid(ReCatNode)) {
      ReCatNode &nn = dynamic_cast(n);
      nn.nullable = nn.pre->nullable && nn.post->nullable;
      nn.firstPos = nn.pre->firstPos;
      if (nn.pre->nullable) nn.firstPos |= nn.post->firstPos;
      nn.lastPos = nn.post->lastPos;
      if (nn.post->nullable) nn.lastPos |= nn.pre->lastPos;
    } else if (typeid(n) == typeid(ReOrNode)) {
      ReOrNode &nn = dynamic_cast(n);
      nn.nullable = nn.pre->nullable || nn.post->nullable;
      nn.firstPos = nn.pre->firstPos;
      nn.firstPos |= nn.post->firstPos;
      nn.lastPos = nn.post->lastPos;
      nn.lastPos |= nn.pre->lastPos;
    } else if (typeid(n) == typeid(ReStarNode)) {
      ReStarNode &nn = dynamic_cast(n);
      nn.nullable = true;
      nn.firstPos = nn.in->firstPos;
      nn.lastPos = nn.in->lastPos;
    } else if (typeid(n) == typeid(ReCharNode)) {
      ReCharNode &nn = dynamic_cast(n);
      nn.nullable = false;
      nn.firstPos.resize(nn.pos + 1);
      nn.firstPos[nn.pos].set();
      nn.lastPos.resize(nn.pos + 1);
      nn.lastPos[nn.pos].set();
    } else if (typeid(n) == typeid(ReEotNode)) {
      ReEotNode &nn = dynamic_cast(n);
      nn.nullable = true;
      nn.firstPos.resize(nn.pos + 1);
      nn.firstPos[nn.pos].set();
      nn.lastPos.resize(nn.pos + 1);
      nn.lastPos[nn.pos].set();
    } else if (typeid(n) == typeid(ReLambdaNode)) {
      ReLambdaNode &nn = dynamic_cast(n);
      nn.nullable = true;
    } else ASSERT(0, "Bat typeid in AttributeGen::operator ()");

    return true;
  }
};



BasicDfaSpec& BasicDfaGenerator::createBasicDfa(const LexicalStateSpec &ls)
{
  BasicDfaSpec &res = *new BasicDfaSpec(ls.name);
  DfaSourceRe *regexp = dynamic_cast(ls.regexp);

  
  
  // 1. do the numbering
  LeafNumberGen nr;
  regexp->dfTraverse(nr);

  // This vector tells us where leave nodes are. At index i, we have a a pointer
  // to leaf node with pos == i
  DfaSourceRe **leaves = new (DfaSourceRe*)[nr.counter];
  LeafFinder lf(leaves);
  regexp->dfTraverse(lf);
  
  
  // 2. compute attributes
  AttributeGen aga;
  regexp->dfTraverse(aga);
  
#ifdef DEBUG
  if (((string)registry["debug"]).find('f') != string::npos) {
    cerr << "-------------------------------------------------------------"
         << endl << "STATE NODES DUMP FOR STATE " << ls.name << endl;
    regexp->dump(cerr);
  }
#endif

  // 3. compute the followpos
  vBitset *followPos = new vBitset[nr.counter];
  FollowPosGen fpg(followPos);
  regexp->dfTraverse(fpg);

#ifdef DEBUG
  if (((string)registry["debug"]).find('f') != string::npos) {
    cerr << "-------------------------------------------------------------"
         << endl << "FOLLOWPOS table:" << endl;
    for (unsigned int i = 0; i < nr.counter; i++) {
      cerr << i << ": ";
      vBitset::reference ri = followPos[i].begin();
      for (unsigned int j = 0; ri.neq(followPos[i].end()); ri.next(), j++)
        if (ri.get()) cerr << j << " ";
      cerr << endl;
    }
  }
#endif

  // the bitvector associated to each state
  vector statesDesc;

  // states for which transitions have not been computed yet.
  // the number stored is the index into statesDesc where the state is described
  queue q;

  // create the initial state
  statesDesc.push_back(new vBitset(regexp->firstPos));
  res.states.push_back(BasicDfaSpec::State());
  q.push(0); 

  while (!q.empty())
  {
    int Dnum = q.front(); q.pop();
    vBitset &Dset = *statesDesc[Dnum];
    
    for (unsigned int c = 0; c < 256; c++) {
      vBitset *Uset = new vBitset();
      
      vBitset::reference pi = Dset.begin();
      for (unsigned int p = 0; pi.neq(Dset.end()); p++, pi.next())
        if (pi.get() && (typeid(*leaves[p]) == typeid(ReCharNode))) {
          ReCharNode &n = dynamic_cast(*leaves[p]);
          if (n.match == c)  // node n at position p matches c, then add
                              // followPos(n) into Udesc
            *Uset |= followPos[p];
        }

      if (Uset->all0()) {
        delete Uset;
        continue; //if U is empty, continue with next c
      }
      
      int Unum = 0;

      for (; Unum < statesDesc.size(); Unum++)
        if (*statesDesc[Unum] == *Uset) break;
      
      if (Unum == statesDesc.size()) {
        // U is a new state
        statesDesc.push_back(Uset);
        res.states.push_back(BasicDfaSpec::State());
        q.push(Unum);
      } else
        delete Uset;
      
      res.states[Dnum].addTransition(c, Unum);
      
    }

  } // while !w.empty()


  // determine and mark as such the accepting states
  // a state is accepting state if at least one of the position that describes
  // it is that of an EOT leaf node
  for (int Snum = 0; Snum < statesDesc.size(); Snum++) {
    vBitset::reference pi = statesDesc[Snum]->begin();
    for (unsigned int i = 0; pi.neq(statesDesc[Snum]->end()); i++, pi.next())
      if (pi.get() && (typeid(*leaves[i]) == typeid(ReEotNode))) {
        ReEotNode &n = dynamic_cast(*leaves[i]);
        // this warning is damn annoying !
        if (res.states[Snum].isFinal) {
          /*  cerr << formatWarning(leaves[i]->getPos(), string("Regexp for token ") +
                                tokens[res.states[Snum].tokId].name() +
                                " hides regexp for token "
                                + tokens[n.tokId].name() +
                                " which might not get matched while in lexical state " +
                                ls.name) << endl;*/
          continue;
        }
        res.states[Snum].isFinal = true;
        res.states[Snum].tokId = n.tokId;
      }
  }
 
  
#ifdef DEBUG
  if (((string)registry["debug"]).find('f') != string::npos) {
    cerr << "----------------------------------------------------" << endl
         << "DFA dump:" << endl;
    res.dump(cerr);
  }
#endif

  for (int i = 0; i < statesDesc.size(); i++) delete statesDesc[i];
  delete[] followPos;
  delete[] leaves;
  
  return res;
}