## Source: www.pudn.com > sphinx_recipe.zip > verify_all.pl
#!/usr/bin/perl
## ====================================================================
##
## Copyright (c) 1996-2000 Carnegie Mellon University. All rights
## reserved.
##
## Redistribution and use in source and binary forms, with or without
## modification, are permitted provided that the following conditions
## are met:
##
## 1. Redistributions of source code must retain the above copyright
## notice, this list of conditions and the following disclaimer.
##
## 2. Redistributions in binary form must reproduce the above copyright
## notice, this list of conditions and the following disclaimer in
## the documentation and/or other materials provided with the
## distribution.
##
## This work was supported in part by funding from the Defense Advanced
## Research Projects Agency and the National Science Foundation of the
## United States of America, and the CMU Sphinx Speech Consortium.
##
## THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
## ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
## THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
## NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
## SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
## LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
## DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
## THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
##
## ====================================================================
##
## Author: Ricky Houghton (converted from scripts by Rita Singh)
##
use File::Copy;
# ---------------------------------------------------------------------
# Locate and load the training configuration.
#
# Usage: verify_all.pl [-cfg file.cfg]
#
# Without -cfg, etc/sphinx_train.cfg (relative to the current directory)
# is used.  The configuration file is loaded with require(), and
# $CFG_SCRIPT_DIR/util/utils.pl is loaded right after it (presumably the
# home of the ST_Log / ST_LogWarning / ST_HTML_Print helpers called by
# the rest of this script -- confirm against utils.pl).
#
# Exits with -3 when the configuration file is missing or empty.
# ---------------------------------------------------------------------
require File::Spec;

my $index = 0;    # index of the first argument not consumed by -cfg
if (defined $ARGV[0] && lc($ARGV[0]) eq '-cfg') {
    $cfg_file = $ARGV[1];
    $index    = 2;
}
else {
    $cfg_file = "etc/sphinx_train.cfg";
}

if (!defined $cfg_file || !(-s $cfg_file)) {
    print ("unable to find default configuration file, use -cfg file.cfg or create etc/sphinx_train.cfg for default\n");
    exit -3;
}

# require() searches @INC for any path that does not start with "/",
# "./" or "../", and perl 5.26+ no longer keeps "." in @INC, so a
# relative path such as the default above would not be found.  Loading
# by absolute path behaves the same on every perl version.
my $cfg_path = File::Spec->rel2abs($cfg_file);
require $cfg_path;

my $utils_path = File::Spec->rel2abs("$CFG_SCRIPT_DIR/util/utils.pl");
require $utils_path;

$ret_value = 0;    # exit status of the script; set negative by failing checks
$| = 1;            # Turn on autoflushing
# ---------------------------------------------------------------------
# Probe whether file names are case sensitive in the current directory:
# create a file with a lowercase name and try to open it through the
# uppercase spelling of the same name.  The result is stored in
# $is_case_sensitive and decides below whether phone names are compared
# verbatim or folded to upper case.
# ---------------------------------------------------------------------
$lowercase_file = "tmp_case_sensitive_test";
$uppercase_file = "TMP_CASE_SENSITIVE_TEST";

# Just in case, clean up both cases
unlink $uppercase_file;
unlink $lowercase_file;

# Assume case sensitive until the probe proves otherwise.
$is_case_sensitive = 1;

if (open(my $probe_out, '>', $lowercase_file)) {
    close($probe_out);
    # If the uppercase name opens the file we just created, the O.S. is
    # case insensitive.
    if (open(my $probe_in, '<', $uppercase_file)) {
        close($probe_in);
        $is_case_sensitive = 0;
    }
}
else {
    # Without the probe file the uppercase open below would fail for the
    # wrong reason; say so instead of silently concluding "case sensitive".
    ST_LogWarning("Can not create $lowercase_file to test for case sensitivity ($!); assuming a case sensitive O.S.\n");
}

if ($is_case_sensitive) {
    ST_Log("O.S. is case sensitive (\"A\" != \"a\").\n" .
           "Phones will be treated as case sensitive.\n");
}
else {
    ST_Log("O.S. is case insensitive (\"A\" == \"a\").\n" .
           "Phones will be treated as case insensitive.\n");
}

# Clean up the mess
unlink $lowercase_file;
unlink $uppercase_file;

ST_Log ("MODULE: 00 verify training files\n");
# PHASE 1: Check to see if the phones in the dictionary are listed in the phonelist file
# PHASE 2: Check to make sure there are not duplicate entries in the dictionary
#
# Reads  : $CFG_DICTIONARY, $CFG_FILLERDICT, $CFG_RAWPHONEFILE
# Writes : %phonelist_hash (phone => 1 if some dictionary entry uses it,
#          0 otherwise; reused by phase 7), $ret_value (-1 for a phase 1
#          failure, -2 for a phase 2 failure)
# Dies   : when one of the three files can not be opened
{
    %dict_phone_hash = ();    # phone => number of uses in both dictionaries
    %dict_hash       = ();    # word  => number of entries in both dictionaries

    ST_Log (" Phase 1: DICT - Checking to see if the dict and filler dict agrees with the phonelist file\n");

    # The phase 1 verdict must start out as 'passed' BEFORE the phonelist
    # is scanned; otherwise failures found while reading it (white space,
    # missing SIL) would be overwritten afterwards.
    $status  = 'passed';
    $counter = 0;             # number of lines read from both dictionaries

    # Both dictionaries share one format ("WORD PHONE PHONE ...") and are
    # tallied into the same hashes.  Each entry: [ path, die message ].
    my @dictionaries = (
        [ $CFG_DICTIONARY, "Can not open the dictionary ($CFG_DICTIONARY)" ],
        [ $CFG_FILLERDICT, "Can not open filler dict ($CFG_FILLERDICT)\n" ],
    );
    for my $dictionary (@dictionaries) {
        my ($path, $open_error) = @$dictionary;
        open(my $dict_fh, '<', $path) or die $open_error;
        while (my $entry = <$dict_fh>) {
            if ($entry =~ /^(\S+)\s(.*)$/) {
                my ($word, $phonetic) = ($1, $2);
                $dict_hash{$word}++;
                # Every run of non-space characters in the pronunciation
                # is one phone.
                for my $phone ($phonetic =~ m/(\S+)/g) {
                    my $phone_key = $is_case_sensitive ? $phone : uc($phone);
                    $dict_phone_hash{$phone_key}++;
                }
            }
            $counter++;
        }
        close($dict_fh);
    }

    # Read the phonelist and stick phones into phonelist_hash
    open(my $phone_fh, '<', $CFG_RAWPHONEFILE) or die "Can not open phone list ($CFG_RAWPHONEFILE)\n";
    my $has_SIL = 0;
    while (my $phone = <$phone_fh>) {
        chomp $phone;
        if ($phone =~ m/\s/) {
            $status = 'FAILED';
            $ret_value = -1;
            ST_LogWarning("Phone \"$phone\" has extra white spaces\n");
        }
        $has_SIL = 1 if $phone =~ m/^SIL$/;
        my $phone_key = $is_case_sensitive ? $phone : uc($phone);
        $phonelist_hash{$phone_key} = 0;
    }
    close($phone_fh);

    unless ($has_SIL) {
        $status = 'FAILED';
        $ret_value = -1;
        ST_LogWarning ("The phonelist ($CFG_RAWPHONEFILE) does not define the phone SIL (required!)\n");
    }

    # Number of distinct phones (not the last index, which is one less).
    my $phone_count = scalar(keys %dict_phone_hash);
    ST_Log (" Found $counter words using $phone_count phones\n");

    # Every phone used by a dictionary must be declared in the phonelist ...
    for my $key (sort (keys %dict_phone_hash)) {
        if (exists $phonelist_hash{$key}) {
            $phonelist_hash{$key} = 1;    # mark as used
        }
        else {
            $status = 'FAILED';
            $ret_value = -1;
            copy("$CFG_GIF_DIR/red-ball.gif", "$CFG_BASE_DIR/.00.1.state.gif");
            ST_LogWarning ("This phone ($key) occurs in the dictionary ($CFG_DICTIONARY), but not in the phonelist ($CFG_RAWPHONEFILE)\n");
        }
    }

    # ... and every phone declared in the phonelist must be used.
    for my $key (sort (keys %phonelist_hash)) {
        if ($phonelist_hash{$key} == 0) {
            $status = 'FAILED';
            $ret_value = -1;
            copy("$CFG_GIF_DIR/red-ball.gif", "$CFG_BASE_DIR/.00.1.state.gif");
            ST_LogWarning ("This phone ($key) occurs in the phonelist ($CFG_RAWPHONEFILE), but not in the dictionary ($CFG_DICTIONARY)\n");
        }
    }

    # NOTE(review): both branches emit the same text; presumably they once
    # differed in markup -- confirm against the original distribution.
    ST_HTML_Print ("\t\t $status \n") if ($status eq 'passed');
    ST_HTML_Print ("\t\t $status \n") if ($status eq 'FAILED');

    ST_Log(" Phase 2: DICT - Checking to make sure there are not duplicate entries in the dictionary\n");
    $duplicate_status = 'passed';
    for my $key (keys %dict_hash) {
        if ($dict_hash{$key} > 1) {
            $ret_value = -2;
            $duplicate_status = 'FAILED';
            copy("$CFG_GIF_DIR/red-ball.gif", "$CFG_BASE_DIR/.00.2.state.gif");
            ST_LogWarning("This word ($key) has duplicate entries in ($CFG_DICTIONARY)\n");
        }
    }

    ST_HTML_Print ("\t\t $duplicate_status \n") if($duplicate_status eq 'passed');
    ST_HTML_Print ("\t\t $duplicate_status \n") if($duplicate_status eq 'FAILED');
}
# Check to make sure .ctl file is roughly of correct format
# 3.) Check that each utterance specified in the .ctl file has a positive length
#     Verify that the files listed are available and are not of size 0
# 4.) Check number of lines in the transcript and in ctl - they should be the same
# 5.) Estimate the amount of training data and suggest a number of tied states
#
# Reads  : $CFG_LISTOFFILES, $CFG_TRANSCRIPTFILE and the feature files
#          $CFG_FEATFILES_DIR/<name>.$CFG_FEATFILE_EXTENSION
# Writes : $ret_value (-3 non-positive utterance length, -4 missing or
#          empty feature file, -5 unparsable line, -6 line count mismatch)
# Dies   : when the control file or the transcript can not be opened
{
    my ($status, @ctl_lines, $file, $start, $end, $number_ctl_lines, $number_transcript_lines);

    # "filename start end [comment]".  Anchored at the start of the line
    # so the name and the two integers are the first three fields, not
    # the last pair of integers that happens to occur on the line.
    my $ctl_with_frames = qr/^(\S+)\s+(\d+)\s+(\d+)/;

    open(my $ctl_fh, '<', $CFG_LISTOFFILES) or die "Can not open listoffiles ($CFG_LISTOFFILES)";
    @ctl_lines = <$ctl_fh>;    # We are going to iterate over this several times
    close($ctl_fh);
    chomp(@ctl_lines);

    # 3.) Check that each utterance specified in the .ctl file has a positive length
    #     Verify that the files listed are available and are not of size 0
    ST_Log(" Phase 3: CTL - Check general format; utterance length (must be positive); files exist\n");
    $status = 'passed';
    $estimated_training_data = 0;    # in frames, from feature file sizes
    for my $ctl_line (@ctl_lines) {
        if ($ctl_line =~ $ctl_with_frames) {
            # Accept: filename int int possible_comment
            ($file, $start, $end) = ($1, $2, $3);
            my $feat_file = "$CFG_FEATFILES_DIR/$file.$CFG_FEATFILE_EXTENSION";
            if ($end <= $start) {
                ST_LogWarning ("Utterance length is <= 0: $start -> $end ($ctl_line)\n");
                $status = 'FAILED';
                $ret_value = -3;
            }
            if (! -s $feat_file) {
                $ret_value = -4;
                $status = 'FAILED';
                ST_LogWarning ("This file, $feat_file, does not exist\n");
            }
        }
        elsif ($ctl_line =~ m/^(\S+)(\s.*)?$/) {
            # Accepts only the file name and possible comment on line by
            # itself..no start/end markers
            $file = $1;
            my $feat_file = "$CFG_FEATFILES_DIR/$file.$CFG_FEATFILE_EXTENSION";
            my $size = -s $feat_file;    # undef when missing, 0 when empty
            if ($size) {
                # 1 frame = 13 floating point numbers = 13*4bytes = 52 bytes (only valid for MFC files)
                $estimated_training_data += ($size / 52) if (lc($CFG_FEATFILE_EXTENSION) eq 'mfc');
            }
            else {
                $ret_value = -4;
                $status = 'FAILED';
                ST_LogWarning ("CTL file, $feat_file, does not exist\n");
            }
        }
        else {
            $status = 'FAILED';
            $ret_value = -5;
            ST_LogWarning ("CTL line does not parse correctly:\n$ctl_line\n");
        }
    }

    ST_HTML_Print ("\t\t $status \n") if($status eq 'passed');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'FAILED');

    $number_ctl_lines = scalar(@ctl_lines);

    # 4) Check number of lines in the transcript and in ctl - they should be the same
    ST_Log (" Phase 4: CTL - Checking number of lines in the transcript should match lines in control file\n");
    open(my $trn_fh, '<', $CFG_TRANSCRIPTFILE) or die "Can not open Transcript file ($CFG_TRANSCRIPTFILE)";
    $number_transcript_lines = 0;
    $number_transcript_lines++ while (<$trn_fh>);
    close($trn_fh);

    $status = 'passed';
    if ($number_ctl_lines != $number_transcript_lines) {
        # A mismatch is a real verification failure: report it and make
        # it visible in the exit status, like every other failed phase.
        $status = 'FAILED';
        $ret_value = -6;
        ST_LogWarning ("Transcript file $CFG_TRANSCRIPTFILE has $number_transcript_lines lines, but control file $CFG_LISTOFFILES has $number_ctl_lines lines\n");
    }
    ST_HTML_Print ("\t\t $status \n") if($status eq 'passed');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'FAILED');

    # 5) Should already have estimates on the total training time,
    ST_Log (" Phase 5: CTL - Determine amount of training data, see if n_tied_states seems reasonable.\n");
    $status = 'passed';
    $total_training_data = 0;    # in frames, from start/end markers
    for my $ctl_line (@ctl_lines) {
        # Accept: filename int int possible_comment
        # start and end time specify start and end frames
        if ($ctl_line =~ $ctl_with_frames) {
            ($file, $start, $end) = ($1, $2, $3);
            $total_training_data += ($end - $start) unless (($end - $start) < 0);
        }
    }
    # No start/end markers at all: fall back on the file-size estimate.
    $total_training_data = $estimated_training_data if ($total_training_data == 0);

    if ($total_training_data) {
        # frames -> hours (presumably 100 frames per second -- confirm)
        $total_training_hours = ($total_training_data / 3600)/100;
        ST_Log("\t\tTotal Hours Training: $total_training_hours\n");
        $estimated_n_tied_states = 1000;
        if ($total_training_hours < 10) {
            $status = 'WARNING';
            ST_Log("\t\tThis is a small amount of data, no comment at this time\n");
        }
        elsif ($total_training_hours < 100) {
            $status = 'WARNING';
            $estimated_n_tied_states = 3000 if ($CFG_HMM_TYPE eq '.cont.'); # Likely bogus
            $estimated_n_tied_states = 4000 if ($CFG_HMM_TYPE eq '.semi.'); #
            ST_Log("\t\tRule of thumb suggests $estimated_n_tied_states, however there is no correct answer\n");
        }
        else {
            $estimated_n_tied_states = 8000;
            $status = 'passed';
            ST_Log("\t\t100+ hours of training data is goodly amount of data.\n");
            ST_Log("\t\tRule of thumb suggests $estimated_n_tied_states for 100 hours, you can adjust accordingly.\n");
        }
    }
    ST_HTML_Print ("\t\t $status \n") if($status eq 'passed');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'FAILED');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'WARNING');
}
# Phones reached through at least one transcript word (phone => 1);
# filled by phase 6 and consumed by phase 7.
%transcript_phonelist_hash = ();

# Verify that all transcription words are in the dictionary, and all
# phones are covered
#
# Reads  : $CFG_DICTIONARY, $CFG_FILLERDICT, $CFG_TRANSCRIPTFILE
# Writes : %d (word => pronunciation string, upper-cased when the O.S. is
#          case insensitive), %transcript_phonelist_hash, $status,
#          $ret_value (-5 when a transcript word is not in %d)
# Dies   : when one of the three files can not be opened
{
    ST_Log(" Phase 6: TRANSCRIPT - Checking that all the words in the transcript are in the dictionary\n");

    # Each entry: [ path, die message, label used in the log line ].
    my @dictionaries = (
        [ $CFG_DICTIONARY, "Can not open the dictionary ($CFG_DICTIONARY)", " Words in dictionary: " ],
        [ $CFG_FILLERDICT, "Can not open filler dict ($CFG_FILLERDICT)\n",  " Words in filler dictionary: " ],
    );
    for my $dictionary (@dictionaries) {
        my ($path, $open_error, $label) = @$dictionary;
        open(my $dict_fh, '<', $path) or die $open_error;
        my $line_count = 0;
        while (my $entry = <$dict_fh>) {
            $line_count++;
            # Skip lines that are not "WORD PRONUNCIATION"; using $1/$2
            # after a failed match would store a stale previous entry.
            next unless $entry =~ /(\S+)\s+(.*)$/;
            $d{$1} = $is_case_sensitive ? $2 : uc($2);
        }
        close($dict_fh);
        # Number of lines read (not the last index, which is one less).
        ST_Log("$label$line_count\n");
    }

    open(my $trn_fh, '<', $CFG_TRANSCRIPTFILE) or die "Can not open the transcript file ($CFG_TRANSCRIPTFILE)";
    $status = 'passed';
    while (my $line = <$trn_fh>) {
        # A transcript line is "text (utterance id)"; lines without the
        # trailing parenthesised part are not checked.
        my ($text) = $line =~ m/(.*)\s*\(.*\)$/;
        next unless $text;
        for my $word (split /\s+/, $text) {
            next unless $word =~ m/\S/;    # empty field from leading white space
            if (! $d{$word}) {
                ST_LogWarning ("This word: $word was in the transcript file, but is not in the dictionary ($text). Do cases match?\n");
                $status = 'FAILED';
                $ret_value = -5;
            }
            else {
                # Remember every phone of the word's pronunciation.
                for my $phone ($d{$word} =~ m/(\S+)/g) {
                    $transcript_phonelist_hash{$phone} = 1;
                }
            }
        }
    }
    close($trn_fh);

    ST_HTML_Print ("\t\t $status \n") if($status eq 'passed');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'FAILED');
    ST_HTML_Print ("\t\t $status \n") if($status eq 'WARNING');
}
{
    # Phase 7: every phone of the phonelist has to show up in the
    # pronunciation of at least one transcript word.  A miss is logged and
    # marks the phase FAILED; $ret_value is left untouched here.
    ST_Log(" Phase 7: TRANSCRIPT - Checking that all the phones in the transcript are in the phonelist, and all phones in the phonelist appear at least once\n");

    # Phonelist entries never seen through a transcript word, in sorted order.
    my @unseen_phones = grep { !defined $transcript_phonelist_hash{$_} }
                        sort keys %phonelist_hash;

    $status = 'passed';
    for my $unseen (@unseen_phones) {
        ST_LogWarning ("This phone ($unseen) occurs in the phonelist ($CFG_RAWPHONEFILE), but not in any word in the transcription ($CFG_TRANSCRIPTFILE)\n");
        $status = 'FAILED';
    }

    # Report the verdict; at most one of the three labels can match.
    for my $verdict ('passed', 'FAILED', 'WARNING') {
        ST_HTML_Print ("\t\t $status \n") if ($status eq $verdict);
    }
}
# Make sure the log directory and the bwaccumdir directory under the
# base directory exist, then report the verification result through the
# exit status (0 when no check set $ret_value).
mkdir ($CFG_LOG_DIR,0755) unless -d $CFG_LOG_DIR;
# Test the same directory that is created (the original tested
# $CFG_LOG_DIR/bwaccumdir while creating $CFG_BASE_DIR/bwaccumdir).
mkdir ("$CFG_BASE_DIR/bwaccumdir",0755) unless -d "$CFG_BASE_DIR/bwaccumdir";
exit ($ret_value);
# General idea for senone:
# 10 hours = 3000 cont. 4000 semi.
# 100 hours = 8000 (cont and semi)
# Rate of increase between the two is very small.