# www.pudn.com > sdk_host_2520.rar > build_dbcs2unicode.pl


#/////////////////////////////////////////////////////////////////////////////// 
# Copyright(C) SigmaTel, Inc. 2000-2001 
# 
# Filename: build_DBCS2Unicode.pl 
# Description: Read the specified Code Page description file (e.g. CP1250.TXT) 
#              and produce Primary DBCS Map (PDM) and Secondary DBCS Maps (SDMs) 
#              resource files (e.g. CP1250_PDM.src and CP1250_SDMs.src). 
#              Also create file listing the Unicode character codes actually 
#              mapped by the Code Page. 
# 
#       Usage: build_DBCS2Unicode.pl CodePagePrefix  
# 
#       Ex: build_DBCS2Unicode.pl CP1250
# 
#/////////////////////////////////////////////////////////////////////////////// 
 
use File::Basename; 
my $tool_dir = $0; 
$tool_dir = (dirname $tool_dir) . "/DBCS2Unicode"; 
push @INC, $tool_dir; 
 
print "tool_dir = $tool_dir\n"; 
 
 
require "build_PDM.pl"; 
require "build_SDMs.pl"; 
require "PDM_SDM.pl"; 
 
                              # The Primary and Secondary Maps are implemented as  
                              # hashes.  %PrimaryDBCSMap is the Primary Map.  Each 
                              # key (00-ff) is either a Single Byte Character (SBC) 
                              # or the Lead Byte of a Multi (double) Byte Character 
                              # (MBC).  For an SBC key, the value of the hash for  
                              # that key is a Unicode Character.  For an MBC key, 
                              # the value is a reference to another hash - the 
                              # Secondary DBCS Map (SDM) for that Lead Byte.  Some 
                              # keys for the Primary Map may have undefined values. 
                              # These should map to the default Unicode character, 
                              # as SBCs. 
my %PrimaryDBCSMap;

                              # %UnicodeCharsMapped records which Unicode characters (out
                              # of 65k possible) are mapped by the Code Page.
my %UnicodeCharsMapped;

                              # Exactly one argument is accepted: the Code Page prefix.
unless (@ARGV == 1) {
    print STDERR "Usage: build_DBCS2Unicode.pl CodePage_prefix \n";
    exit -1;
}

my $CodePagePrefix = shift @ARGV;

                              # The default character is hardcoded to 1 (given as a
                              # hex string, hence the conversion).
my $defaultUnicodeChar = hex 1;

                              # Parse the Code Page file; this fills both hashes and
                              # hands back a summary that is printed at the very end.
my $statsString = readCodePageFile(\%PrimaryDBCSMap, $CodePagePrefix, \%UnicodeCharsMapped);

                              # Each builder signals failure with a false return value.
die "PDM build failed!\n"
    unless build_PDM(\%PrimaryDBCSMap, $CodePagePrefix, $defaultUnicodeChar);

die "SDMs build failed!\n"
    unless build_SDMs(\%PrimaryDBCSMap, $CodePagePrefix, $defaultUnicodeChar);

build_UnicodeCharList($CodePagePrefix, \%UnicodeCharsMapped);

print STDERR $statsString;
 
 
 
 
#;///////////////////////////////////////////////////////////////////////////////
#;> Name: readCodePageFile
#;
#;  Type: Function
#;  Description: Read the specified Code Page File, parse the DBCS-to-Unicode
#                mappings and populate the Primary and Secondary DBCS Maps (hashes).
#
#;  Inputs:      $PrimaryDBCSMapRef - Reference to %PrimaryDBCSMap - the thing to
#                                     populate.
#                $CodePagePrefix    - First part of the Code Page filename.  The
#                                     file actually opened is "<prefix>.txt".
#                $UnicodeCharsMappedRef - Reference to hash for recording which Unicode
#                                     character codes are actually mapped by the Code
#                                     Page.
#;  Outputs:     $PrimaryDBCSMapRef - Gets filled with values and references to new
#                                     (Secondary DBCS Map) hashes.  Keys are numeric
#                                     byte values; a Lead Byte key holds a hash
#                                     reference keyed by Trailing Byte.
#                $UnicodeCharsMappedRef - Gets a key (value 1) for every Unicode
#                                     character code that appears in a mapping.
#;  Returns:     A printable statistics string (characters mapped, Unicode
#                regions used, Lead Bytes used).
#
#;  Notes:       The Code Page File format follows that of files found on the
#                Unicode.org Web site.  They appear to have come from Microsoft,
#                originally.
#
#                The files are ASCII, with each line being a comment (beginning
#                with a "#" character) or a character mapping.
#
#                A character mapping line starts with a DBCS value, of either
#                two or four digits (e.g. "0x30" or "0x8130").  The longer values
#                are Multi Byte Characters (MBC);  the shorter are Single Byte
#                Characters (SBC).
#
#                After the DBCS value, there is whitespace, followed by an optional
#                Unicode character value, in hex (e.g. "0x014e").  If the Unicode
#                value is missing, the DBCS code is either a Lead Byte or is undefined;
#                such lines are skipped.
#
#                Following the Unicode value, there is more whitespace, then an
#                optional comment, beginning with a "#" character.
#
#                Dies if the file cannot be opened.  Prints an error and exits
#                with -1 if a Single Byte Character is defined twice, or if a
#                byte is used both as a Single Byte Character and as a Lead Byte.
#                Any line that matches none of the forms above produces a
#                WARNING on STDOUT and is otherwise ignored.
#
#;<
#;///////////////////////////////////////////////////////////////////////////////
sub readCodePageFile {
    my $PrimaryDBCSMapRef     = shift;
    my $CodePagePrefix        = shift;
    my $UnicodeCharsMappedRef = shift;

    my %LeadBytes;                  # set of Lead Bytes seen
    my %UnicodeRegions;             # set of high bytes (code >> 8) of mapped Unicode chars
    my $cntMappedChars = 0;

    my $codePageFileName = "$CodePagePrefix.txt";
    open(my $codePageFile, '<', $codePageFileName)
        or die "Can't open Code Page file: $codePageFileName\n";

    while (my $line = <$codePageFile>) {

        # Line = 0xYYYY  0xZZZZ  # COMMENTS
        # Any amount of whitespace may separate the two values.
        if ($line =~ /^0x(\S+)\s+0x(\S+)/i) {
            my $DBCS    = hex($1);
            my $Unicode = hex($2);

            if ($DBCS > 0xff) {  # MBC (Multi-Byte Character (double-byte))
                my $LeadByte     = ($DBCS & 0xff00) >> 8;
                my $TrailingByte = $DBCS & 0x00ff;

                # The Lead Byte's slot must be empty or already hold a
                # Secondary Map.  A plain (non-reference) value means the
                # same byte was defined earlier as a Single Byte Character.
                if (defined $$PrimaryDBCSMapRef{$LeadByte} && ! ref $$PrimaryDBCSMapRef{$LeadByte}) {
                    print STDERR "ERROR: $DBCS is multiply-defined";
                    exit -1;
                }

                # Autovivifies the Secondary Map on first use of this Lead Byte.
                $$PrimaryDBCSMapRef{$LeadByte}->{$TrailingByte} = $Unicode;
                $LeadBytes{$LeadByte} = 1;
            }
            else {               # SBC  (Single-Byte Character)

                # Also catches a byte that is already in use as a Lead Byte,
                # since its slot then holds a (defined) hash reference.
                if (defined $$PrimaryDBCSMapRef{$DBCS}) {
                    print STDERR "ERROR: $DBCS is multiply-defined";
                    exit -1;
                }

                $$PrimaryDBCSMapRef{$DBCS} = $Unicode;
            }

            # Bookkeeping common to both kinds of mapping.
            $$UnicodeCharsMappedRef{$Unicode} = 1;
            $cntMappedChars++;
            $UnicodeRegions{$Unicode >> 8} = 1;
        }
        elsif ($line =~ /^0x\S+/) {       # Line = 0xYYYY            # COMMENT
            # Lead Byte or undefined code: nothing to record.
        }
        elsif ($line =~ /^\s*\#/) {       # Line = # COMMENT
            # Skip comment lines
        }
        else {                            # Anything else
            print "WARNING: Line in Code Page File not matched: $line";
        }
    }

    close $codePageFile;

    my $cntUnicodeRegions = scalar keys %UnicodeRegions;
    my $cntLeadBytes      = scalar keys %LeadBytes;

    my $statsString =
      "\n\nCharacters mapped = $cntMappedChars,    Unicode regions used = $cntUnicodeRegions\n" .
      "LeadBytes used = $cntLeadBytes\n";

    return $statsString;
}
 
 
#;///////////////////////////////////////////////////////////////////////////////
#;> Name: build_UnicodeCharList
#;
#;  Type: Function
#;  Description:      Create a file listing all of the Unicode character codes
#                     used by the Code Page.
#;  Inputs:           $CodePagePrefix        - First part of Code Page filename
#                     $UnicodeCharsMappedRef - Reference to hash.  Keys are Unicode
#                                              char code values (0000-ffff).  If the
#                                              value for a key is true, that code
#                                              was mapped.
#;  Outputs:          File "<prefix>_UnicodeUsed.txt", one code per line in
#                     ascending order, formatted as "0x%04x".
#;  Returns:          1 on success.
#;  Notes:            Dies if the output file cannot be opened or closed.
#;<
#;///////////////////////////////////////////////////////////////////////////////
sub build_UnicodeCharList {
    my $CodePagePrefix = shift;
    my $UnicodeCharsMappedRef = shift;

    my $listFileName = "${CodePagePrefix}_UnicodeUsed.txt";

    # Low-precedence "or" so that the die is tied to open() itself rather
    # than to the (always true) filename argument.
    open(my $listFile, '>', $listFileName)
        or die "Can't open output file: $listFileName\n";

    # Walk the whole 16-bit range so the listing comes out in numeric order.
    for my $charCode (0 .. 0xffff) {
        if ($$UnicodeCharsMappedRef{$charCode}) {
            printf $listFile "0x%04x\n", $charCode;
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($listFile)
        or die "Can't close output file: $listFileName\n";

    return 1;
}
 
#////////////////////////////////////////////////////////////////////// 
#////////////////////////////////////////////////////////////////////// 
#//////////////////////////////////////////////////////////////////////