# WordsToDictionary.pl (from sphinx_recipe.zip, via www.pudn.com)



# Converts a word list into a dictionary by looking up the words
# in a provided other superset dictionary.
#
# It should output every variant that was in the dictionary.
#
# Outputs !!UNKNOWN!! for any words we couldn't find in the superset
# dictionary
#
# Optionally can add sentence start and end symbols to top.
#
# Also checks to be sure that variants all have (#) notation otherwise
# the sphinx2 decoder will crash.
#
# Copyright 2006 by Keith Vertanen

use strict;
use warnings;

# Usage:
#   WordsToDictionary.pl <word list> <dictionary> <output file>
#                        [sentence start] [sentence end]
#
#   word list       - file with one word per line (first whitespace-separated
#                     token of each line is the word that gets looked up)
#   dictionary      - superset pronunciation dictionary: "WORD phone phone ..."
#   output file     - dictionary restricted to the words in the word list
#   sentence start  - optional symbol written first, mapped to "[] sil"
#   sentence end    - optional symbol written next, mapped to "[] sil"
#
# Unknown words are reported on STDOUT and written to the output file with a
# "!!UNKNOWN!!" pronunciation so they are easy to find afterwards.

if ( @ARGV < 3 )
{
    print "$0 <word list> <dictionary> <output file> [sentence start] [sentence end]\n";
    exit(1);
}

my ($listFile, $dictFile, $outFile, $sentStart, $sentEnd) = @ARGV;

# Open both inputs before creating the output so that a bad input path
# does not leave a truncated output file behind.  Three-argument open
# treats the names literally (no mode characters taken from the name).
open(my $dictIn, '<', $dictFile)
    or die "Cannot open dictionary '$dictFile' for reading: $!\n";
open(my $listIn, '<', $listFile)
    or die "Cannot open word list '$listFile' for reading: $!\n";
open(my $out, '>', $outFile)
    or die "Cannot open output '$outFile' for writing: $!\n";

# See if we need to put sentence start/end symbols at the top
for my $marker ($sentStart, $sentEnd)
{
    print {$out} $marker . " [] sil\n" if $marker;
}

# Read in the dictionary.  For every base word (the word with any (#)
# variant suffix removed) we keep the list of output lines, in file order.
my %entriesFor;

while (defined(my $line = readline($dictIn)))
{
    $line =~ tr/\r\n//d;

    # split ' ' ignores leading whitespace, so an indented entry still
    # yields the word as the first field.
    my ($word, @phones) = split(' ', $line);

    # Blank lines carry no entry; storing them would create an empty word.
    next unless defined $word;

    # For hash purposes, we eliminate the (#) variant bit.  Variant numbers
    # start at 2 and may have more than one digit; (0) and (1) are left
    # alone, as before.
    (my $baseWord = $word) =~ s/\((?:[2-9]|[1-9]\d+)\)//g;

    my $variants = $entriesFor{$baseWord} ||= [];

    # A repeated word without (#) notation would crash the sphinx2
    # decoder, so number it after the variants already collected.
    if (@$variants && index($word, "(") == -1)
    {
        $word .= "(" . (@$variants + 1) . ")";
    }

    push @$variants, $word . "\t" . join(" ", @phones) . "\n";
}
close($dictIn);

# Look up every word of the word list and emit all of its variants.
while (defined(my $line = readline($listIn)))
{
    $line =~ tr/\r\n//d;

    my ($word) = split(' ', $line);

    # Skip blank lines instead of emitting an empty !!UNKNOWN!! entry.
    next unless defined $word;

    # Escape any leading apostrophes
    $word = "\\" . $word if index($word, "'") == 0;

    if (my $variants = $entriesFor{$word})
    {
        print {$out} uc(join("", @$variants));
    }
    else
    {
        print "Unknown word: " . $line . "\n";
        print {$out} $line . "\t!!UNKNOWN!!\n";
    }
}
close($listIn);

# Buffered write errors only surface at close, so check it.
close($out)
    or die "Error while writing '$outFile': $!\n";