/* Listing origin: www.pudn.com > gzip.rar > deflate.c */


/* deflate.c -- compress data using the deflation algorithm 
 * Copyright (C) 1992-1993 Jean-loup Gailly 
 * This is free software; you can redistribute it and/or modify it under the 
 * terms of the GNU General Public License, see the file COPYING. 
 */ 
 
/* 
 *  PURPOSE 
 * 
 *      Identify new text as repetitions of old text within a fixed- 
 *      length sliding window trailing behind the new text. 
 * 
 *  DISCUSSION 
 * 
 *      The "deflation" process depends on being able to identify portions 
 *      of the input text which are identical to earlier input (within a 
 *      sliding window trailing behind the input currently being processed). 
 * 
 *      The most straightforward technique turns out to be the fastest for 
 *      most input files: try all possible matches and select the longest. 
 *      The key feature of this algorithm is that insertions into the string 
 *      dictionary are very simple and thus fast, and deletions are avoided 
 *      completely. Insertions are performed at each input character, whereas 
 *      string matches are performed only when the previous match ends. So it 
 *      is preferable to spend more time in matches to allow very fast string 
 *      insertions and avoid deletions. The matching algorithm for small 
 *      strings is inspired from that of Rabin & Karp. A brute force approach 
 *      is used to find longer strings when a small match has been found. 
 *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze 
 *      (by Leonid Broukhis). 
 *         A previous version of this file used a more sophisticated algorithm 
 *      (by Fiala and Greene) which is guaranteed to run in linear amortized 
 *      time, but has a larger average cost, uses more memory and is patented. 
 *      However the F&G algorithm may be faster for some highly redundant 
 *      files if the parameter max_chain_length (described below) is too large. 
 * 
 *  ACKNOWLEDGEMENTS 
 * 
 *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and 
 *      I found it in 'freeze' written by Leonid Broukhis. 
 *      Thanks to many info-zippers for bug reports and testing. 
 * 
 *  REFERENCES 
 * 
 *      APPNOTE.TXT documentation file in PKZIP 1.93a distribution. 
 * 
 *      A description of the Rabin and Karp algorithm is given in the book 
 *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252. 
 * 
 *      Fiala,E.R., and Greene,D.H. 
 *         Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-505
 * 
 *  INTERFACE 
 * 
 *      void lm_init (int pack_level, ush *flags) 
 *          Initialize the "longest match" routines for a new file 
 * 
 *      ulg deflate (void) 
 *          Processes a new input file and return its compressed length. Sets 
 *          the compressed length, crc, deflate flags and internal file 
 *          attributes. 
 */ 
 
#include <stdio.h>
#include "gzip.h"
#include "lzw.h" /* just for consistency checking */
 
/* =========================================================================== 
 * Configuration parameters 
 */ 
 
/* Compile with MEDIUM_MEM to reduce the memory requirements or 
 * with SMALL_MEM to use as little memory as possible. Use BIG_MEM if the 
 * entire input file can be held in memory (not possible on 16 bit systems). 
 * Warning: defining these symbols affects HASH_BITS (see below) and thus 
 * affects the compression ratio. The compressed output 
 * is still correct, and might even be smaller in some cases. 
 */ 
 
/* Number of bits used to hash strings. SMALL_MEM and MEDIUM_MEM select the
 * smaller hash tables described in the comment above; an explicit HASH_BITS
 * definition (for instance from the compiler command line) always wins.
 */
#ifndef HASH_BITS
#  ifdef SMALL_MEM
#    define HASH_BITS  13
#  elif defined(MEDIUM_MEM)
#    define HASH_BITS  14
#  else
#    define HASH_BITS  15
     /* For portability to 16 bit machines, do not use values above 15. */
#  endif
#endif
 
/* To save space (see unlzw.c), we overlay prev+head with tab_prefix and 
 * window with tab_suffix. Check that we can do this: 
 */ 
#if (WSIZE<<1) > (1< BITS-1 
    error: cannot overlay head with tab_prefix1 
#endif 
 
#define HASH_SIZE (unsigned)(1<= HASH_BITS 
 */ 
 
unsigned int near prev_length = 0;
/* Length of the best match at previous step. Matches not greater than this
 * are discarded. This is used in the lazy match evaluation.
 */

unsigned near strstart = 0;      /* start of string to insert */
unsigned near match_start = 0;   /* start of matching string */
local int           eofile = 0;        /* flag set at end of input file */
local unsigned      lookahead = 0;     /* number of valid bytes ahead in window */

unsigned near max_chain_length = 0;
/* To speed up deflation, hash chains are never searched beyond this length.
 * A higher limit improves compression ratio but degrades the speed.
 * longest_match() divides this limit by four when the previous match is
 * already at least good_match long.
 */

local unsigned int max_lazy_match = 0;
/* Attempt to find a better match only when the current match is strictly
 * smaller than this value.
 * NOTE(review): the original comment restricted this to compression
 * levels >= 4, but the deflate() in this file applies the test at every
 * level -- confirm which is intended.
 */
#define max_insert_length  max_lazy_match
/* Insert new strings in the hash table only if the match length
 * is not greater than this length. This saves time but degrades compression.
 * NOTE(review): described as used only for compression levels <= 3; no use
 * of max_insert_length is visible in this part of the file.
 */
/* NOTE(review): a stray comment "compression level (1..9)" stood here with
 * no variable attached; the declaration it described appears to have been
 * removed from this copy.
 */

unsigned near good_match = 0;
/* longest_match() uses a shorter chain search when the previous match is
 * at least this long.
 */
 
/* Values for max_lazy_match, good_match and max_chain_length, depending on 
 * the desired pack level (0..9). The values given below have been tuned to 
 * exclude worst case performance for pathological files. Better values may be 
 * found for specific files. 
 */ 
 
/* One row of tuning parameters; configuration_table holds one row per
 * pack level.
 */
typedef struct config { 
    ush good_length; /* reduce lazy search above this match length */ 
    ush max_lazy;    /* do not perform lazy search above this match length */ 
    ush nice_length; /* quit search above this match length */ 
    ush max_chain;   /* hash chain search limit; presumably the source of
                      * max_chain_length -- TODO confirm against lm_init */
} config; 
 
int near nice_match = 0; /* longest_match() stops searching once a match reaches this length */
 
/* Tuning parameters for each pack level; the trailing comment on each row
 * gives the level. Columns follow the field order of struct config:
 *      good lazy nice chain */
local config configuration_table[10] = { 
    {   0,    0,  0,    0},  /* 0: store only */
    {   4,    4,  8,    4},  /* 1: maximum speed, no lazy matches */
    {   4,    5, 16,    8},  /* 2 */ 
    {   4,    6, 32,   32}, /* 3 */ 
 
    {   4,    4, 16,   16},  /* 4: lazy matches */
    {   8,   16, 32,   32},/* 5 */ 
    {   8,   16, 128, 128},/* 6 */ 
    {   8,   32, 128, 256},/* 7 */ 
    {   32, 128, 258, 1024},/* 8 */ 
    {   32, 258, 258, 4096}}; /* 9: maximum compression */
 
/* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4.
 * For deflate_fast() (levels <= 3) good is ignored and lazy has a different
 * meaning.
 */
 
#define EQUAL 0 
/* Result of memcmp for equal strings. No use of EQUAL is visible in this
 * part of the file.
 */
 
    /* =========================================================================== 
     *  Prototypes for local functions. 
     *  fill_window is file-local; longest_match is declared without `local`
     *  (a comment further down says an assembly version may replace it).
     */ 
    local void fill_window   OF(( void )); 
 
int  longest_match OF(( IPos cur_match )); 
 
/* ===========================================================================
 * Update a hash value with the given input byte
 * IN  assertion: all calls to UPDATE_HASH are made with consecutive
 *    input characters, so that a running hash key can be computed from the
 *    previous key instead of complete recalculation each time.
 */
#define UPDATE_HASH(h,c) (h = (((h)<<H_SHIFT) ^ (c)) & HASH_MASK)

/* ===========================================================================
 * Insert string s in the dictionary and set match_head to the previous head
 * of the hash chain (the most recent string with same hash key).
 * IN  assertion: all calls to INSERT_STRING are made with consecutive
 *    input characters and the first MIN_MATCH bytes of s are valid
 *    (except for the last MIN_MATCH-1 bytes of the input file).
 */
#define INSERT_STRING(s, match_head) \
   (UPDATE_HASH(ins_h, window[(s) + MIN_MATCH-1]), \
    prev[(s) & WMASK] = match_head = head[ins_h], \
    head[ins_h] = (s))

/* NOTE(review): in this copy the text from the middle of UPDATE_HASH up to
 * the end of the comment below had been stripped. INSERT_STRING is restored
 * because deflate() uses it. lm_init(), listed in the INTERFACE section at
 * the top of the file, presumably lived in the lost span as well and must be
 * restored from a pristine copy of the source.
 */

/* ===========================================================================
 * Set match_start to the longest match starting at the given string and
 * return its length. Matches shorter or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is
 * garbage.
 * IN assertions: cur_match is the head of the hash chain for the current
 *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
 */
/* For MSDOS, OS/2 and 386 Unix, an optimized version is in match.asm or 
 * match.s. The code is functionally equivalent, so you can use the C version 
 * if desired. 
 */ 
/* Walk the hash chain that starts at cur_match (following prev[]) and look
 * for the longest string in the window matching the one at strstart.
 * Returns the best length found, which starts out as prev_length;
 * match_start is written only when a strictly longer match is found.
 */
int longest_match( IPos cur_match ) 
{ 
    unsigned chain_length = max_chain_length;   /* max hash chain length */ 
    register uch * scan = window + strstart;     /* current string */ 
    register uch * match;                        /* matched string */ 
    register int len;                           /* length of current match */ 
    int best_len = prev_length;                 /* best match length so far */ 
    IPos limit = strstart > ( IPos )MAX_DIST ? strstart - ( IPos )MAX_DIST : NIL; 
    /* Stop when cur_match becomes <= limit. To simplify the code, 
     * we prevent matches with the string of window index 0. 
     */ 
 
    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. 
     * It is easy to get rid of this optimization if necessary. 
     */ 
    register uch * strend = window + strstart + MAX_MATCH; 
    /* The two bytes a candidate must match at offsets best_len-1 and
     * best_len to have any chance of beating the current best.
     */
    register uch scan_end1  = scan[best_len - 1]; 
    register uch scan_end   = scan[best_len]; 
 
    if ( prev_length >= good_match ) { 
        /* Do not waste too much time if we already have a good match: */ 
        chain_length >>= 2; 
    } 
 
    do 
    { 
        Assert( cur_match < strstart, "no future" ); 
        match = window + cur_match; 
 
        /* Skip to next match if the match length cannot increase 
         * or if the match length is less than 2: 
         */ 
 
        if ( match[best_len]   != scan_end  || 
            match[best_len - 1] != scan_end1 || 
            * match            != * scan     || 
            *++ match          != scan[1] )      continue; 
 
        /* The check at best_len-1 can be removed because it will be made 
         * again later. (This heuristic is not always a win.) 
         * It is not necessary to compare scan[2] and match[2] since they 
         * are always equal when the other bytes match, given that 
         * the hash keys are equal and that HASH_BITS >= 8. 
         */ 
        scan += 2, match ++; 
 
        /* We check for insufficient lookahead only every 8th comparison; 
         * the 256th check will be made at strstart+258. 
         */ 
        do { 
        } while (*++ scan == *++ match && *++ scan == *++ match && 
            *++ scan == *++ match && *++ scan == *++ match && 
            *++ scan == *++ match && *++ scan == *++ match && 
            *++ scan == *++ match && *++ scan == *++ match && 
            scan < strend ); 
 
        /* scan now points at the first mismatching byte (or at strend):
         * convert that position into a match length, then rewind scan to
         * the start of the string for the next candidate.
         */
        len = MAX_MATCH - ( int )( strend - scan ); 
        scan = strend - MAX_MATCH; 
 
        if ( len > best_len ) { 
            match_start = cur_match; 
            best_len = len; 
            if ( len >= nice_match ) break; 
            /* Refresh the two probe bytes for the new best length. */
            scan_end1  = scan[best_len - 1]; 
            scan_end   = scan[best_len]; 
        } 
        /* Follow the chain to the next older string with the same hash,
         * until it gets too distant or the chain budget runs out.
         */
    } while (( cur_match = prev[cur_match & WMASK] ) > limit 
        && -- chain_length != 0 ); 
 
    return best_len; 
} 
 
/* Match verification hook: expands to nothing here, so the call in
 * deflate() leaves only an empty statement.
 */
#define check_match(start, match, length) 
 
/* ===========================================================================
 * Fill the window when the lookahead becomes insufficient.
 * Updates strstart and lookahead, and sets eofile if end of input file.
 * IN assertion: lookahead < MIN_LOOKAHEAD && strstart + lookahead > 0
 * OUT assertions: at least one byte has been read, or eofile is set;
 *    file reads are performed for at least two bytes (required for the
 *    translate_eol option).
 */
local void fill_window()
{
    register unsigned n, m;
    unsigned more = (unsigned)(window_size - (ulg)lookahead - (ulg)strstart);
    /* Amount of free space at the end of the window. */

    /* If the window is almost full and there is insufficient lookahead,
     * move the upper half to the lower one to make room in the upper half.
     */
    if (more == (unsigned)-1) {
        /* Very unlikely, but possible on 16 bit machine if strstart == 0
         * and lookahead == 1 (input done one byte at time)
         */
        more--;
    } else if (strstart >= WSIZE + MAX_DIST) {
        /* By the IN assertion, the window is not empty so we can't confuse
         * more == 0 with more == 64K on a 16 bit machine.
         */
        Assert(window_size == (ulg)2 * WSIZE, "no sliding with BIG_MEM");

        memcpy((char *)window, (char *)window + WSIZE, (unsigned)WSIZE);
        match_start -= WSIZE;
        strstart    -= WSIZE; /* we now have strstart >= MAX_DIST: */

        /* block_start is a long (FLUSH_BLOCK compares it with 0L), so the
         * offset is converted to long: an int cast cannot be relied on to
         * hold WSIZE where int is 16 bits wide.
         */
        block_start -= (long)WSIZE;

        /* Rebase every hash chain entry by WSIZE; entries that pointed
         * into the discarded lower half become NIL.
         */
        for (n = 0; n < HASH_SIZE; n++) {
            m = head[n];
            head[n] = (Pos)(m >= WSIZE ? m - WSIZE : NIL);
        }
        for (n = 0; n < WSIZE; n++) {
            m = prev[n];
            prev[n] = (Pos)(m >= WSIZE ? m - WSIZE : NIL);
            /* If n is not on any hash chain, prev[n] is garbage but
             * its value will never be used.
             */
        }
        more += WSIZE;
    }
    /* At this point, more >= 2 */
    if (!eofile) {
        n = read_buf((char *)window + strstart + lookahead, more);
        if (n == 0 || n == (unsigned)-1) {
            /* read_buf returned nothing or (unsigned)-1: stop reading. */
            eofile = 1;
        } else {
            lookahead += n;
        }
    }
}
 
/* ===========================================================================
 * Flush the current block, with given end-of-file flag.
 * IN assertion: strstart is set to the end of the current match.
 * The block is passed to flush_block() as a pointer into window when
 * block_start is non-negative and as NULL otherwise, together with its
 * length strstart - block_start. The macro evaluates to flush_block()'s
 * result.
 * Nothing may follow the continuation backslashes: a blank after the
 * backslash is not a line splice in standard C.
 */
#define FLUSH_BLOCK(eof) \
   flush_block(block_start >= 0L ? (char*)&window[(unsigned)block_start] : \
                (char*)NULL, (long)strstart - block_start, (eof))
 
/* =========================================================================== 
 * Compress the input held in the window until lookahead is exhausted,
 * using lazy evaluation of matches: a match is finally adopted only if
 * there is no better match at the next window position.
 * Returns the value of the final FLUSH_BLOCK(1) call (the INTERFACE
 * section at the top of the file describes it as the compressed length).
 */ 
ulg deflate() 
{ 
    IPos hash_head;          /* head of hash chain */ 
    IPos prev_match;         /* previous match */ 
    int flush;               /* set if current block must be flushed */ 
    int match_available = 0; /* set if previous match exists */ 
    register unsigned match_length = MIN_MATCH - 1; /* length of best match */ 
 
    /* Process the input block. */ 
    while ( lookahead != 0 ) { 
        /* Insert the string window[strstart .. strstart+2] in the 
         * dictionary, and set hash_head to the head of the hash chain: 
         */ 
        INSERT_STRING( strstart, hash_head ); 
 
        /* Find the longest match, discarding those <= prev_length. 
         * The match found at the previous position is remembered in
         * prev_length / prev_match before the new search starts.
         */ 
        prev_length = match_length, prev_match = match_start; 
        match_length = MIN_MATCH - 1; 
 
        if ( hash_head != NIL && prev_length < max_lazy_match && 
            strstart - hash_head <= MAX_DIST ) { 
            /* To simplify the code, we prevent matches with the string 
             * of window index 0 (in particular we have to avoid a match 
             * of the string with itself at the start of the input file). 
             */ 
            match_length = longest_match ( hash_head ); 
            /* longest_match() sets match_start */ 
            if ( match_length > lookahead ) match_length = lookahead; 
 
            /* Ignore a length 3 match if it is too distant: */ 
            if ( match_length == MIN_MATCH && strstart - match_start > TOO_FAR ){ 
                /* If prev_match is also MIN_MATCH, match_start is garbage 
                 * but we will ignore the current match anyway. 
                 */ 
                match_length --; 
            } 
        } 
        /* If there was a match at the previous step and the current 
         * match is not better, output the previous match: 
         */ 
        if ( prev_length >= MIN_MATCH && match_length <= prev_length ) { 
 
            check_match( strstart - 1, prev_match, prev_length ); 
 
            /* The previous match started at strstart-1, hence the -1 in
             * the distance passed to ct_tally.
             */
            flush = ct_tally( strstart - 1 - prev_match, prev_length - MIN_MATCH ); 
 
            /* Insert in hash table all strings up to the end of the match. 
             * strstart-1 and strstart are already inserted. 
             */ 
            lookahead -= prev_length - 1; 
            prev_length -= 2; /* two positions are already in the table */
            do { 
                strstart ++; 
                INSERT_STRING( strstart, hash_head ); 
                /* strstart never exceeds WSIZE-MAX_MATCH, so there are 
                 * always MIN_MATCH bytes ahead. If lookahead < MIN_MATCH 
                 * these bytes are garbage, but it does not matter since the 
                 * next lookahead bytes will always be emitted as literals. 
                 */ 
            } while (-- prev_length != 0 ); 
            /* Start afresh after the match: no pending literal or match. */
            match_available = 0; 
            match_length = MIN_MATCH - 1; 
            strstart ++; 
            if ( flush ) FLUSH_BLOCK( 0 ), block_start = strstart; 
        } else if ( match_available ) { 
            /* If there was no match at the previous position, output a 
             * single literal. If there was a match but the current match 
             * is longer, truncate the previous match to a single literal. 
             */ 
            Tracevv(( stderr, "%c", window[strstart - 1] )); 
            if ( ct_tally ( 0, window[strstart - 1] )) { 
                FLUSH_BLOCK( 0 ), block_start = strstart; 
            } 
            strstart ++; 
            lookahead --; 
        } else { 
            /* There is no previous match to compare with, wait for 
             * the next step to decide. 
             */ 
            match_available = 1; 
            strstart ++; 
            lookahead --; 
        } 
        Assert ( strstart <= isize && lookahead <= isize, "a bit too far" ); 
 
        /* Make sure that we always have enough lookahead, except 
         * at the end of the input file. We need MAX_MATCH bytes 
         * for the next match, plus MIN_MATCH bytes to insert the 
         * string following the next match. 
         */ 
        while ( lookahead < MIN_LOOKAHEAD && ! eofile ) fill_window(); 
    } 
    /* A byte may still be pending from the last step: emit it as a literal. */
    if ( match_available ) ct_tally ( 0, window[strstart - 1] ); 
 
    return FLUSH_BLOCK( 1 ); /* eof */ 
}