www.pudn.com > tidy_src_041213.rar > streamio.h


#ifndef __STREAMIO_H__
#define __STREAMIO_H__

/* streamio.h -- handles character stream I/O

  (c) 1998-2003 (W3C) MIT, ERCIM, Keio University
  See tidy.h for the copyright notice.

  CVS Info :

    $Author: hoehrmann $ 
    $Date: 2004/03/07 14:38:47 $ 
    $Revision: 1.12 $ 

  Wrapper around Tidy input source and output sink
  that calls appropriate interfaces, and applies 
  necessary char encoding transformations: to/from
  ISO-10646 and/or UTF-8.

*/

#include "forward.h"
#include "buffio.h"
#include "fileio.h"

#ifdef __cplusplus
extern "C"
{
#endif
typedef enum
{
  FileIO,
  BufferIO,
  UserIO
} IOType;

/************************
** Source
************************/

#define CHARBUF_SIZE 5

/* non-raw input is cleaned up*/
struct _StreamIn
{
    int    state;     /* FSM for ISO2022 */
    Bool   pushed;
    tchar* charbuf;
    uint   bufpos;
    uint   bufsize;
    int    tabs;
    int    lastcol;
    int    curcol;
    int    curline;
    int    encoding;
    IOType iotype;

    TidyInputSource source;

#ifdef TIDY_WIN32_MLANG_SUPPORT
    ulong  mlang;
#endif

#ifdef TIDY_STORE_ORIGINAL_TEXT
    tmbstr otextbuf;
    size_t otextsize;
    uint   otextlen;
#endif

    /* Pointer back to document for error reporting */
    TidyDocImpl* doc;
};

void freeStreamIn(StreamIn* in);

StreamIn* FileInput( TidyDocImpl* doc, FILE* fp, int encoding );
StreamIn* BufferInput( TidyDocImpl* doc, TidyBuffer* content, int encoding );
StreamIn* UserInput( TidyDocImpl* doc, TidyInputSource* source, int encoding );

int       ReadBOMEncoding(StreamIn *in);
uint      ReadChar( StreamIn* in );
void      UngetChar( uint c, StreamIn* in );
uint      PopChar( StreamIn *in );
Bool      IsEOF( StreamIn* in );


/************************
** Sink
************************/

struct _StreamOut
{
    int   encoding;
    int   state;     /* for ISO 2022 */
    uint  nl;

#ifdef TIDY_WIN32_MLANG_SUPPORT
    ulong mlang;
#endif

    IOType iotype;
    TidyOutputSink sink;
};

StreamOut* FileOutput( FILE* fp, int encoding, uint newln );
StreamOut* BufferOutput( TidyBuffer* buf, int encoding, uint newln );
StreamOut* UserOutput( TidyOutputSink* sink, int encoding, uint newln );

StreamOut* StdErrOutput(void);
StreamOut* StdOutOutput(void);
void       ReleaseStreamOut( StreamOut* out );

void WriteChar( uint c, StreamOut* out );
void outBOM( StreamOut *out );

ctmbstr GetEncodingNameFromTidyId(uint id);

/************************
** Misc
************************/

/* character encodings
*/
#define RAW         0
#define ASCII       1
#define LATIN0      2
#define LATIN1      3
#define UTF8        4
#define ISO2022     5
#define MACROMAN    6
#define WIN1252     7
#define IBM858      8

#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE     9
#define UTF16BE     10
#define UTF16       11
#endif

/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5        12
#define SHIFTJIS    13
#else
#define BIG5        9
#define SHIFTJIS    10
#endif
#endif

#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG  36
#endif

/* states for ISO 2022

 A document in ISO-2022 based encoding uses some ESC sequences called
 "designator" to switch character sets. The designators defined and
 used in ISO-2022-JP are:

    "ESC" + "(" + ?     for ISO646 variants

    "ESC" + "$" + ?     and
    "ESC" + "$" + "(" + ?   for multibyte character sets
*/
#define FSM_ASCII    0
#define FSM_ESC      1
#define FSM_ESCD     2
#define FSM_ESCDP    3
#define FSM_ESCP     4
#define FSM_NONASCII 5


/* char encoding used when replacing illegal SGML chars,
** regardless of specified encoding.  Set at compile time
** to either Windows or Mac.
*/
extern const int ReplacementCharEncoding;

/* Function for conversion from Windows-1252 to Unicode */
uint DecodeWin1252(uint c);

/* Function to convert from MacRoman to Unicode */
uint DecodeMacRoman(uint c);

/* Function for conversion from OS/2-850 to Unicode */
uint DecodeIbm850(uint c);

/* Function for conversion from Latin0 to Unicode */
uint DecodeLatin0(uint c);

/* Function to convert from Symbol Font chars to Unicode */
uint DecodeSymbolFont(uint c);
#ifdef __cplusplus
}
#endif


/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict Mac compilers that may re-define these.
*/
#define CR    0xD
#define LF    0xA

#if   defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif


#endif /* __STREAMIO_H__ */