# www.pudn.com > sdk_host_2520.rar > build_dbcs2unicode.pl
#///////////////////////////////////////////////////////////////////////////////
# Copyright(C) SigmaTel, Inc. 2000-2001
#
# Filename: build_DBCS2Unicode.pl
# Description: Read the specified Code Page description file (e.g. CP1250.TXT)
# and produce Primary DBCS Map (PDM) and Secondary DBCS Maps (SDMs)
# resource files (e.g. CP1250_PDM.src and CP1250_SDMs.src).
# Also create file listing the Unicode character codes actually
# mapped by the Code Page.
#
# Usage: build_DBCS2Unicode.pl CodePagePrefix
#
# Ex: build_DBCS2Unicode.pl CP1250
#
#///////////////////////////////////////////////////////////////////////////////
use File::Basename;
my $tool_dir = $0;
$tool_dir = (dirname $tool_dir) . "/DBCS2Unicode";
push @INC, $tool_dir;
print "tool_dir = $tool_dir\n";
require "build_PDM.pl";
require "build_SDMs.pl";
require "PDM_SDM.pl";
# Data model
# ----------
# Both the Primary and the Secondary DBCS Maps are ordinary hashes.
# %PrimaryDBCSMap is the Primary Map and is keyed by a byte value
# (00-ff).  A key is either
#   * a Single Byte Character (SBC): the value is its Unicode
#     character code, or
#   * the Lead Byte of a Multi (double) Byte Character (MBC): the
#     value is a reference to a further hash, the Secondary DBCS
#     Map (SDM) belonging to that Lead Byte.
# Keys of the Primary Map may also be left undefined; per the design
# notes those are meant to be treated as SBCs that map to the default
# Unicode character.
my %PrimaryDBCSMap;

# Set of the Unicode character codes (out of the 65k possible) that
# the Code Page actually maps; a code is present as a key when mapped.
my %UnicodeCharsMapped;

# Exactly one command-line argument - the Code Page prefix - is accepted.
unless (@ARGV == 1) {
    print STDERR "Usage: build_DBCS2Unicode.pl CodePage_prefix \n";
    exit -1;
}
my $CodePagePrefix = shift @ARGV;

# The default character is hardcoded to 1; it is passed through hex()
# exactly as a command-line hex string would be.
my $defaultUnicodeChar = hex('1');

# Parse the Code Page file; this fills both hashes and yields a
# statistics summary that is reported at the very end.
my $statsString = readCodePageFile(\%PrimaryDBCSMap, $CodePagePrefix, \%UnicodeCharsMapped);

# Emit the Primary map and then the Secondary maps; each builder
# signals failure with a false return value.
unless (build_PDM(\%PrimaryDBCSMap, $CodePagePrefix, $defaultUnicodeChar)) {
    die "PDM build failed!\n";
}
unless (build_SDMs(\%PrimaryDBCSMap, $CodePagePrefix, $defaultUnicodeChar)) {
    die "SDMs build failed!\n";
}

# Write the list of Unicode character codes used, then the statistics.
build_UnicodeCharList($CodePagePrefix, \%UnicodeCharsMapped);
print STDERR $statsString;
#;///////////////////////////////////////////////////////////////////////////////
#;> Name: readCodePageFile
#;
#; Type: Function
#; Description: Read the specified Code Page File, parse the DBCS-to-Unicode
# mappings and populate the Primary and Secondary DBCS Maps (hashes).
#
#; Inputs: $PrimaryDBCSMapRef - Reference to %PrimaryDBCSMap - the thing to
# populate.
# $CodePagePrefix - First part of the Code Page filename. The
# rest is assumed to be ".txt" (the file opened is "<prefix>.txt").
# $UnicodeCharsMappedRef - Reference to hash for recording which Unicode
# character codes are actually mapped by the Code
# Page.
#; Outputs: $PrimaryDBCSMapRef - Gets filled with values and references to new
# (Secondary DBCS Map) hashes.
#
#; Notes: The Code Page File format follows that of files found on the
# Unicode.org Web site. They appear to have come from Microsoft,
# originally.
#
# The files are ASCII, with each line being a comment (beginning
# with a "#" character) or a character mapping.
#
# A character mapping line starts with a DBCS value, of either
# two or four digits (e.g. "0x30" or "0x8130"). The longer values
# are Multi Byte Characters (MBC); the shorter are Single Byte
# Characters (SBC).
#
# After the DBCS value, there is whitespace, followed by an optional
# Unicode character value, in hex (e.g. "0x014e"). If the Unicode
# value is missing, the DBCS code is either a Lead Byte or is undefined.
#
# Following the Unicode value, there is more whitespace, then an
# optional comment, beginning with a "#" character.
#
#;<
#;///////////////////////////////////////////////////////////////////////////////
sub readCodePageFile {
    # Parse "<CodePagePrefix>.txt" and populate the DBCS maps.
    #
    # Arguments (positional):
    #   $PrimaryDBCSMapRef     - hash ref to fill.  SBC codes map directly to
    #                            a Unicode value; each MBC lead byte maps to a
    #                            hash ref keyed by trailing byte.
    #   $CodePagePrefix        - file name without the ".txt" extension.
    #   $UnicodeCharsMappedRef - hash ref; every mapped Unicode value is
    #                            recorded as a key with value 1.
    #
    # Returns: a human-readable statistics string (characters mapped, number
    #          of distinct Unicode high bytes used, lead bytes used).
    #
    # Dies if the file cannot be opened.  Prints an error to STDERR and
    # exits with -1 if a DBCS code is defined more than once, or if a byte
    # is used both as an SBC and as an MBC lead byte.
    my $PrimaryDBCSMapRef     = shift;
    my $CodePagePrefix        = shift;
    my $UnicodeCharsMappedRef = shift;

    my $cntMappedChars = 0;
    my %LeadBytes;         # set of lead bytes seen in MBC mappings
    my %UnicodeRegions;    # set of Unicode high bytes ($Unicode >> 8) seen

    # Three-argument open with a lexical handle: the prefix comes from the
    # command line and must never be interpreted as an open mode.
    my $codePageFile = "$CodePagePrefix.txt";
    open(my $codePageFH, '<', $codePageFile)
        or die "Can't open Code Page file: $codePageFile: $!\n";

    while (my $line = <$codePageFH>) {
        if ($line =~ /^0x(\S+)\s+0x(\S+)/i) {    # Line = 0xYYYY 0xZZZZ # COMMENTS
            my $DBCS    = hex($1);
            my $Unicode = hex($2);

            if ($DBCS > 0xff) {    # MBC (Multi-Byte Character (double-byte))
                my $LeadByte     = ($DBCS & 0xff00) >> 8;
                my $TrailingByte = $DBCS & 0x00ff;
                my $existing     = $$PrimaryDBCSMapRef{$LeadByte};

                # The lead byte must not already be mapped as an SBC (a plain
                # scalar must never be dereferenced as a hash), and the same
                # double-byte code must not be mapped twice.
                my $leadIsSBC   = defined $existing && !ref $existing;
                my $isDuplicate = ref $existing
                                  && defined $existing->{$TrailingByte};
                if ($leadIsSBC || $isDuplicate) {
                    printf STDERR "ERROR: 0x%04x is multiply-defined\n", $DBCS;
                    exit -1;
                }

                $$PrimaryDBCSMapRef{$LeadByte}->{$TrailingByte} = $Unicode;
                $LeadBytes{$LeadByte} = 1;
            }
            else {                 # SBC (Single-Byte Character)
                # Also fires when the byte is already in use as a lead byte.
                if (defined $$PrimaryDBCSMapRef{$DBCS}) {
                    printf STDERR "ERROR: 0x%02x is multiply-defined\n", $DBCS;
                    exit -1;
                }
                $$PrimaryDBCSMapRef{$DBCS} = $Unicode;
            }

            # Bookkeeping common to SBC and MBC mappings.
            $$UnicodeCharsMappedRef{$Unicode} = 1;
            $cntMappedChars++;
            $UnicodeRegions{$Unicode >> 8} = 1;
        }
        elsif ($line =~ /^0x\S+/i) {             # Line = 0xYYYY # COMMENT
            # DBCS code without a Unicode value: a lead byte or an undefined
            # code.  Nothing is recorded for it.
        }
        elsif ($line =~ /^\s*\#/) {              # Line = # COMMENT
            # Skip comment lines (optionally indented).
        }
        elsif ($line =~ /^\s*$/) {               # Line = (blank)
            # Skip blank lines silently.
        }
        else {                                   # Anything else
            print "WARNING: Line in Code Page File not matched: $line";
        }
    }
    close $codePageFH;

    my $cntUnicodeRegions = keys %UnicodeRegions;
    my $cntLeadBytes      = keys %LeadBytes;

    my $statsString =
        "\n\nCharacters mapped = $cntMappedChars, Unicode regions used = $cntUnicodeRegions\n" .
        "LeadBytes used = $cntLeadBytes\n";
    return $statsString;
}
#;///////////////////////////////////////////////////////////////////////////////
#;> Name: build_UnicodeCharList
#;
#; Type: Function
#; Description: Create a file listing all of the Unicode character codes
# used by the Code Page.
#; Inputs: $CodePagePrefix - First part of Code Page filename
# $UnicodeCharsMappedRef - Reference to hash. Keys are Unicode
# char code values (0000-ffff). If hash
# is defined for a key, that code was mapped.
#; Outputs: Writes the file <CodePagePrefix>_UnicodeUsed.txt, one "0x%04x" line per mapped code.
#; Notes:
#;<
#;///////////////////////////////////////////////////////////////////////////////
sub build_UnicodeCharList {
    # Write "<CodePagePrefix>_UnicodeUsed.txt", listing in ascending order
    # every Unicode character code in 0x0000-0xffff that has a true value in
    # the given hash, one per line, formatted as "0x%04x".
    #
    # Arguments (positional):
    #   $CodePagePrefix        - first part of the output file name.
    #   $UnicodeCharsMappedRef - hash ref keyed by Unicode character code.
    #
    # Returns 1 on success.  Dies if the output file cannot be opened or
    # closed.
    my $CodePagePrefix        = shift;
    my $UnicodeCharsMappedRef = shift;

    my $listFile = "${CodePagePrefix}_UnicodeUsed.txt";

    # "or" (not "||") so the die guards the open() call itself rather than
    # being attached to the file name expression.
    open(my $listFH, '>', $listFile)
        or die "Can't open output file: $listFile\n";

    for my $charCode (0 .. 0xffff) {
        if ($$UnicodeCharsMappedRef{$charCode}) {
            printf {$listFH} "0x%04x\n", $charCode;
        }
    }

    # Buffered write errors only surface at close time, so check it.
    close($listFH)
        or die "Can't close output file: $listFile: $!\n";
    return 1;
}
#//////////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////////
#//////////////////////////////////////////////////////////////////////