# Source: www.pudn.com > sphinx_recipe.zip > FixCMUDict.pl
#!/usr/bin/perl
# Prepares a Sphinx style dictionary based on the original
# from CMU. Does the following:
# Strips any stress marking in a CMU format dictionary.
# Eliminates comment lines
# Sorts alphabetically by words
# Make sure there aren't identical pronunciations of the same word
# Changes erroneous "e" phone to "eh"
# Outputs the list of all unique phones encountered
# Eliminates all "split" words with underscores
#
# Can optionally add a string to the end of each pronunciation (like sp)
#
# Copyright 2005 by Keith Vertanen
#
use strict;
use warnings;

# Usage: FixCMUDict.pl <list file> <phone file> [add to end]
#
#   <list file>   CMU format dictionary to read (one "WORD  PH PH ..." per line)
#   <phone file>  file that receives the sorted list of unique phones
#   [add to end]  optional text appended to every pronunciation (like sp)
#
# The cleaned dictionary is written to standard output.

# Run only when executed as a script. When the file is loaded with
# require/do, caller() is true and the subroutines can be used on their own.
main(@ARGV) unless caller;

# normalizePhones($rest)
#
# Turns the pronunciation part of a dictionary line into a list of phones:
# every digit (the stress markers) is removed, the text is upper-cased and
# split on whitespace, and the erroneous phone "E" is replaced by "EH"
# wherever it occurs (including first and last position).
#
# Returns the list of phones; the list is empty when nothing is left.
sub normalizePhones
{
    my ($rest) = @_;

    $rest =~ tr/0-9//d;

    # split(' ', ...) also discards leading whitespace, so the two-space
    # separator used by CMU does not produce an empty first phone.
    return map { $_ eq "E" ? "EH" : $_ } split(' ', uc($rest));
}

# readDictionary($in, $addToEnd)
#
# Reads dictionary lines from the open filehandle $in.
#
#   - Blank lines and comment lines (starting with "#" or with the CMU
#     comment marker ";;;") are ignored.
#   - Words containing an underscore ("split" words) are dropped.
#   - A variant marker such as "(2)" at the end of a word is removed so
#     that all variants are collected under the same base word.
#   - $addToEnd (may be empty or undef) is appended unchanged to every
#     pronunciation; its whitespace-separated parts count as phones.
#   - A pronunciation that is identical to one already stored for the same
#     base word is discarded. The comparison is made on the cleaned phone
#     sequence, so variants that only differed in stress are caught.
#   - Lines without a word or without any phone are reported with warn()
#     and skipped.
#
# Returns two hash references:
#   words  - base word => reference to array of pronunciation strings, in
#            the order they were first seen
#   phones - phone => 1 for every phone that was kept
sub readDictionary
{
    my ($in, $addToEnd) = @_;

    my @extra = defined($addToEnd) ? split(' ', $addToEnd) : ();
    my %words;
    my %seen;
    my %phones;

    while (my $line = <$in>)
    {
        $line =~ s/[\r\n]+\z//;

        next if $line !~ /\S/;
        next if index($line, "#") == 0;
        next if index($line, ";;;") == 0;

        # The word is everything up to the first run of whitespace. A line
        # starting with whitespace yields an empty word here.
        my ($word, $rest) = split(/\s+/, $line, 2);

        next if $word =~ /_/;

        my @linePhones = defined($rest) ? normalizePhones($rest) : ();
        if ($word eq "" || !@linePhones)
        {
            warn "$0: skipping malformed line $.: $line\n";
            next;
        }

        my $base = $word;
        $base =~ s/\(\d+\)\z//;

        my @allPhones = (@linePhones, @extra);
        foreach my $phone (@allPhones)
        {
            $phones{$phone} = 1;
        }

        # Only keep a pronunciation the first time it shows up for a word
        my $pron = join(" ", @allPhones);
        next if $seen{$base}{$pron}++;
        push(@{ $words{$base} }, $pron);
    }

    return (\%words, \%phones);
}

# main($listFile, $phoneFile [, $addToEnd])
#
# Prints the usage message and exits with status 1 when fewer than two
# arguments are given. Otherwise reads $listFile, prints the cleaned
# dictionary to standard output sorted by word, and writes the sorted list
# of unique phones (always including SIL) to $phoneFile.
#
# Each output line is "WORD<tab>PH PH ..." with single spaces between
# phones. The first pronunciation of a word carries the plain word, later
# ones are numbered WORD(2), WORD(3), ... in the order they were kept.
#
# Dies when either file cannot be opened or the phone file cannot be
# written completely.
sub main
{
    if (@_ < 2)
    {
        print STDOUT "$0 <list file> <phone file> [add to end]\n";
        exit(1);
    }

    my ($listFile, $phoneFile, $addToEnd) = @_;

    open(my $in, "<", $listFile)
        or die "$0: cannot open $listFile: $!\n";
    my ($words, $phones) = readDictionary($in, $addToEnd);
    close($in);

    foreach my $word (sort keys %$words)
    {
        my $variants = $words->{$word};
        foreach my $i (0 .. $#$variants)
        {
            # Variation numbers start at 2; the first entry has no number
            my $label = ($i == 0) ? $word : $word . "(" . ($i + 1) . ")";
            print STDOUT $label . "\t" . $variants->[$i] . "\n";
        }
    }

    # Always add a SIL phone
    $phones->{"SIL"} = 1;

    open(my $out, ">", $phoneFile)
        or die "$0: cannot open $phoneFile for writing: $!\n";
    foreach my $phone (sort keys %$phones)
    {
        print $out $phone . "\n";
    }
    close($out)
        or die "$0: cannot write $phoneFile: $!\n";

    return;
}