# www.pudn.com > sphinx_recipe.zip > FixCMUDict.pl


#!/usr/bin/perl

# Prepares a Sphinx style dictionary based on the original
# from CMU.  Does the following:
#    Strips any stress marking in a CMU format dictionary.
#    Eliminates comment lines
#    Sorts alphabetically by words
#    Make sure there aren't identical pronunciations of the same word
#    Changes erroneous "e" phone to "eh"
#    Outputs the list of all unique phones encountered
#    Eliminates all "split" words with underscores
#
# Can optionally add a string to the end of each pronunciation (like sp)
#
# Copyright 2005 by Keith Vertanen
#

use strict;

use warnings;

# Usage: FixCMUDict.pl <list file> <phone file> [add to end]
#
#   <list file>   CMU format dictionary to read.
#   <phone file>  Receives the sorted list of unique phones, one per line.
#   [add to end]  Optional phone(s) appended to every pronunciation (like sp).
#
# The cleaned dictionary itself is written to standard output.
if ( @ARGV < 2 )
{
    print "$0 <list file> <phone file> [add to end]\n";
    exit(1);
}

my ($listFile, $phoneFile, $addToEnd) = @ARGV;
$addToEnd = '' unless defined $addToEnd;

open(my $in, '<', $listFile)
    or die "$0: cannot open dictionary '$listFile' for reading: $!\n";

my %pronsFor;   # base word => array ref of its unique pronunciations, in input order
my %phones;     # set of every phone that appears in the output

while (my $line = <$in>)
{
    # Skip comment lines: "#" as this script always has, plus the ";;;"
    # marker that CMU dictionary files use for their own comments.
    next if index($line, "#") == 0;
    next if index($line, ";;;") == 0;

    $line =~ s/[\r\n]+\z//;

    # The word is everything up to the first whitespace; the rest is the
    # pronunciation.  Blank lines, lines starting with whitespace and lines
    # that have no pronunciation at all do not match and are skipped.
    next unless $line =~ /\A(\S+)\s+(\S.*)\z/;
    my ($word, $rest) = ($1, $2);

    # Eliminate "split" words with underscores.
    next if $word =~ /_/;

    # Drop any trailing variant marker such as "(2)" so that all variants
    # of a word are grouped together; they get renumbered on output.
    my $baseWord = $word;
    $baseWord =~ s/\(\d+\)\z//;

    # Strip the stress digits and normalise case.
    $rest =~ s/[0-9]//g;
    $rest = uc($rest);

    # Work on whole phones rather than on the raw string: this avoids the
    # stray leading blank left by the two-space separator and lets the
    # E -> EH fix apply to every E, including the last phone on the line.
    my @phoneList = map { $_ eq 'E' ? 'EH' : $_ } split(' ', $rest);
    next unless @phoneList;

    # The optional suffix is appended as given (it is not upper-cased).
    push @phoneList, split(' ', $addToEnd) if length($addToEnd) > 0;

    # Keep track of the unique phones.
    $phones{$_} = 1 for @phoneList;

    # Only add if we don't have an identical pronunciation for this word.
    # The comparison is on the phones alone, so a variant that differed
    # from an earlier one only by stress marks is recognised as a duplicate
    # no matter how it was numbered in the input.
    my $pron  = join(' ', @phoneList);
    my $prons = $pronsFor{$baseWord} ||= [];
    next if grep { $_ eq $pron } @$prons;

    push @$prons, $pron;
}

close($in);

# Write the dictionary sorted by word.  The first pronunciation of a word is
# unnumbered; later ones are numbered (2), (3), ... in the order encountered.
foreach my $baseWord (sort keys %pronsFor)
{
    my $variant = 0;

    foreach my $pron (@{ $pronsFor{$baseWord} })
    {
        $variant++;

        my $label = $baseWord;
        $label .= "(" . $variant . ")" if $variant > 1;

        print $label . "\t" . $pron . "\n";
    }
}

# Always add a SIL phone
$phones{"SIL"} = 1;

open(my $out, '>', $phoneFile)
    or die "$0: cannot open phone file '$phoneFile' for writing: $!\n";

foreach my $phone (sort keys %phones)
{
    print {$out} $phone . "\n";
}

# Buffered write errors only surface at close, so check it.
close($out)
    or die "$0: error writing phone file '$phoneFile': $!\n";