www.pudn.com > CIVStringSet.zip > StringSet.Cpp


// StringSet.Cpp: implementation of the CIVStringSet class. 
// 
// Written 12 June 2002 by Scot T Brennecke 
// Thanks to Moishe Halibard and Moshe Rubin for their article, 
//    "A Multiple Substring Search Algorithm" in the June 2002 
//    edition of C/C++ Users Journal.  This class is based on 
//    the algorthim therein described, but extended to return 
//    all strings and use MFC classes. 
 
 
#include "StdAfx.H" 
#include "StringSet.H" 
 
#ifdef _DEBUG 
#undef THIS_FILE 
static char THIS_FILE[]=__FILE__ ; 
#define new DEBUG_NEW 
#endif 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
//                       C I V S t r i n g S e t 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// CIVStringSet 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
CIVStringSet::CIVStringSet( WORD wInitialWidth /*= 64*/ ) 
    : m_apnFSM( new DWORD[wInitialWidth][128] ), 
      m_nCurDim( wInitialWidth ), 
      m_nUsedCols( 0 ), 
      m_wMaxUsedState( 0 ), 
      m_nCurTextChar( 0 ) 
{ 
    // Initialize all FSM entries to zero 
    ZeroMemory( m_apnFSM, sizeof(DWORD) * 128 * wInitialWidth ) ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// ~CIVStringSet 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
CIVStringSet::~CIVStringSet() 
{ 
    delete [] m_apnFSM ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// Add 
//    several variations, the first being the most basic 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
bool CIVStringSet::Add( LPCTSTR pszWord ) 
{ 
    bool bRetVal = false ; 
 
    // Insert the word into the FSM, then add it to the array 
    if ( ( pszWord != NULL ) 
      && InsertWord( pszWord, (WORD)(m_nSize + 1) ) 
       ) 
        bRetVal = (CStringArray::Add( pszWord ) >= 0) ; 
 
    return bRetVal ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// add a single word from a CString (preserves reference counting) 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
bool CIVStringSet::Add( const CString & rstrWord ) 
{ 
    bool bRetVal = false ; 
 
    // Insert the word into the FSM, then add it to the array 
    if ( InsertWord( (LPCTSTR)rstrWord, (WORD)(m_nSize + 1) ) ) 
        bRetVal = (CStringArray::Add( rstrWord ) >= 0) ; 
 
    return bRetVal ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// Add multiple words, delimited with chars from pszDelims 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
int CIVStringSet::Add( LPCTSTR pszWords, LPCTSTR pszDelims ) 
{ 
    int nCount = 0 ; 
 
    // Start with the entire string 
    CString strUnparsed( pszWords ), 
            strToken ; 
 
    // stop iterating when we find no more words following delimiters 
    while ( ! strUnparsed.IsEmpty() ) 
    { 
        // Extract the first token 
        strToken = strUnparsed.SpanExcluding( pszDelims ) ; 
        if ( ! strToken.IsEmpty() ) 
        { 
            if ( Add( strToken ) )  // add it to the FSM and the array 
                nCount++ ; 
            else 
                break ; 
        } 
 
        // Now, move beyond the token just extracted 
        int nTokenLen = strToken.GetLength() ; 
        if ( nTokenLen < strUnparsed.GetLength() ) 
            strUnparsed = strUnparsed.Mid( nTokenLen ) ; 
        else 
            break ; 
 
        // Remove the delimiters 
        strUnparsed.TrimLeft( pszDelims ) ; 
    } 
 
    return nCount ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// add nWords words, each \0 term'd, with extra \0 at end 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
int CIVStringSet::Add( LPCTSTR pszzWords, int nWords ) 
{ 
    int nCount = 0 ; 
 
    // Start with the first word in the multi-string 
    LPCTSTR pszCurWord = pszzWords ; 
    if ( pszCurWord != NULL ) 
    { 
        // stop iterating when we find a "word" beginning with null (the extra \0) 
        while ( pszCurWord[0] != '\0' ) 
        { 
            if ( Add( pszCurWord ) )  // add it to the FSM and the array 
            { 
                nCount++ ; 
 
                // Position to the next word, one beyond the null terminator 
                size_t nLen = _tcslen( pszCurWord ) ; 
                // If this fails, the extra \0 was probably not there, as required 
                pszCurWord = &pszzWords[nLen+1] ; 
            } 
            else 
                break ; 
        } 
    } 
 
#ifdef _DEBUG 
    if ( ( nWords != nCount ) 
      && ( nWords > 0 ) 
       ) 
        TRACE("Expected %d words, but found %d!\n", nWords, nCount); 
#else 
    UNUSED(nWords); 
#endif 
 
    return nCount ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// add all the elements of a CStringArray 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
int CIVStringSet::Add( CStringArray astrWords ) 
{ 
    int nCount = astrWords.GetSize() ; 
    SetSize( m_nSize + nCount ) ; 
 
    for ( int nWord = 0 ; nWord < nCount ; nWord++ ) 
    { 
        if ( ! Add( astrWords[nWord] ) )  // add it to the FSM and the array 
            break ; 
    } 
 
    return nWord ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// add all the elements of a CStringList 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
int CIVStringSet::Add( CStringList lstrWords ) 
{ 
    int nCount = lstrWords.GetCount() ; 
    SetSize( m_nSize + nCount ) ; 
 
    // Iterate the string list 
    POSITION pos = lstrWords.GetHeadPosition() ; 
    while ( pos != NULL ) 
    { 
        if ( ! Add( lstrWords.GetNext( pos ) ) )  // add it to the FSM and the array 
            break ; 
    } 
 
    return nCount ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// InsertWord 
//          put the new word into the FSM 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
bool CIVStringSet::InsertWord( LPCTSTR pszWord, WORD wIndex ) 
{ 
    bool bRetVal = false ; 
 
    size_t nLen = _tcslen( pszWord ) ; 
    if ( ( m_wMaxUsedState < MAXWORD )    // if we've got 65,535 states, something's wrong 
      && SetColDim( m_nUsedCols + nLen )  // make sure enough columns are allocated 
       ) 
    { 
        WORD wCurState = 0 ; 
        for ( UINT nChar = 0 ; nChar < nLen ; nChar++ ) 
        { 
            int nIdxChar = (pszWord[nChar] & 0x7F) ;  // mask out char values above 127 
            DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ;  // the state, and possibly also an index 
            WORD wState = LOWORD(dwEntry) ;  // extract only the state portion 
            bool bNew = (wState == 0) ;      // no previous words have been here 
 
            if ( bNew ) 
                wState = ++m_wMaxUsedState ;  // use a new state number 
 
            // special case - last character of substring 
            if ( nChar == ( nLen-1 ) ) 
            { 
                // Put both the state number and the word index in the entry 
                m_apnFSM[wCurState][nIdxChar] = (DWORD)MAKELONG(wState, wIndex) ; 
                break ; 
            } 
            else if ( bNew ) // if current entry for this char value and state is still zero, add to FSM 
                m_apnFSM[wCurState][nIdxChar] = wState ; 
 
            wCurState = wState ;  // prepare for next iteration 
        } 
 
        m_nUsedCols += nLen ; 
        bRetVal = true ; 
    } 
 
    return bRetVal ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// SetColDim 
//          set the current width to at least nNewDim columns 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
bool CIVStringSet::SetColDim( size_t nNewDim ) 
{ 
    bool bRetVal = false ; 
 
    // Determine if we don't yet have enough columns for the newest word  
    if ( m_nCurDim < nNewDim ) 
    { 
        int nNewSize = ((nNewDim + 63) & 0xFFC0) ;  // set to next higher multiple of 64 
        if ( nNewSize < MAXWORD )  // we're "limited" to 65,535 columns 
        { 
            // reallocate the FSM with the increased column width 
            DWORD (* apnTemp)[128] = new DWORD[nNewSize][128] ; 
            ASSERT((int)m_nUsedCols < nNewSize); 
            // Copy the portion already used, and zero out the rest 
            CopyMemory( &apnTemp[0], &m_apnFSM[0], sizeof(DWORD) * 128 * m_nUsedCols ) ; 
            ZeroMemory( &apnTemp[m_nUsedCols], sizeof(DWORD) * 128 * (nNewSize - m_nUsedCols) ) ; 
            delete [] m_apnFSM ; // Get rid of old FSM buffer, 
            m_apnFSM = apnTemp ; // and point to new one 
            m_nCurDim = (WORD)nNewSize ; 
 
            bRetVal = true ; 
        } 
    } 
    else 
        bRetVal = true ; 
 
    return bRetVal ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// FindFirstIn 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
UINT CIVStringSet::FindFirstIn( CString strText, int & rnFirst ) 
{ 
    // The real work is done by FindNext, but we must initialize the starting 
    //    character index and string buffer first 
    m_nCurTextChar = 0 ; 
    m_strSearch = strText ; 
 
    return FindNext( rnFirst ) ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// FindNext 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
UINT CIVStringSet::FindNext( int & rnNext ) 
{ 
    WORD wCurState = 0 ; // start in state 0 
    UINT nStartChar = 0xFFFFFFFF ; 
    size_t nLen = m_strSearch.GetLength() ; 
 
    // Start with the character following the last one examined 
    for ( UINT nIdx = m_nCurTextChar ; nIdx < nLen ; nIdx++ ) 
    { 
        WORD wPrevState = wCurState ; 
 
        // if we are in state 0, save the index of the starting character, 
        //   in case we must backtrack or for the return value 
        if ( wCurState == 0 ) 
            nStartChar = nIdx ; 
 
        // follow up the table states 
        int nIdxChar = (m_strSearch[(int)nIdx] & 0x7F) ;  // mask out char values above 127 
        DWORD dwEntry = m_apnFSM[wCurState][nIdxChar] ; 
 
        // if we reach an entry with a high word (an index), we have found a full match 
        //    return the word we found 
        if ( dwEntry > MAXWORD ) 
        { 
            int nIndex = HIWORD(dwEntry) - 1 ; 
            ASSERT(nIndex < GetSize()); 
            rnNext = nIndex ; 
            m_nCurTextChar = nIdx + 1 ;  // set the index for the next time 
            break ; 
        } 
        else 
            wCurState = LOWORD(dwEntry) ; 
 
        // have we followed a false lead? 
        // if so, backtrack nIdx to position to continue search, with state reset to zero 
        // set nIdx to nStartChar, and it will be incremented as the loop re-enters 
        if ( ( wPrevState != 0 ) 
          && ( wCurState == 0 ) 
           ) 
        { 
            nIdx = nStartChar ; 
            nStartChar = 0xFFFFFFFF ;  // in case we're about to terminate the loop 
        } 
    } 
 
    // return the index of the start of the word, or -1 if no word found 
    return (( nIdx < nLen ) ? nStartChar : 0xFFFFFFFF) ; 
} 
 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
// FindAllIn 
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
size_t CIVStringSet::FindAllIn( CString strText, CWordPosPairList & rlstrWords ) 
{ 
    rlstrWords.clear() ; 
 
    // Iterate through all words, and put them in the list 
    int  nWord = -1 ; 
    UINT nPos = FindFirstIn( strText, nWord ) ; 
    if ( nPos != 0xFFFFFFFF ) 
    { 
        do 
        { 
            CWordPosPair posWord( nWord, nPos ) ; 
            rlstrWords.push_back( posWord ) ; 
            nPos = FindNext( nWord ) ; 
        } while ( nPos != 0xFFFFFFFF ) ; 
    } 
 
    return rlstrWords.size() ; 
}