# Source: www.pudn.com > sphinx_recipe.zip > convert_code.sh



# Converts all the WSJ0 and WSJ1 training and test data from the
# original compressed form, to normal sphere headed wave 
# files that can be coded by Sphinx.

# Both locations must be defined before anything is touched: with an empty
# WSJ_ROOT a bare `cd` would silently jump to $HOME, and with an empty
# CMU_WSJ the paths below would resolve to /wv1_files.txt, /feat, /wav, ...
: "${WSJ_ROOT:?WSJ_ROOT must be set to the directory holding wsj0 and wsj1}"
: "${CMU_WSJ:?CMU_WSJ must be set to the output/working directory}"

# All later relative paths (./wsj0, ./wsj1) are resolved from here, so
# stop immediately if the directory cannot be entered.
cd "$WSJ_ROOT" || exit 1

# Clean up any old files
rm -f -- "$CMU_WSJ/wv1_files.txt"

# Create some directories we'll need
mkdir -p -- "$CMU_WSJ/feat" "$CMU_WSJ/wav" "$CMU_WSJ/logdir"

# Only get the files in the training and test directories.
# Earlier version, which searched the whole tree (kept for reference):
#find -iname *.wv1 | grep -E "_tr_|_TR_|_dt_|_ET_" >$CMU_WSJ/wv1_files.txt

# Changed to only get WSJ0 and WSJ1 directories.
# The -iname pattern is quoted so that find receives it verbatim; unquoted,
# the shell would expand it against the current directory whenever a
# matching file happens to exist there.  Both roots are searched by a single
# find, which lists ./wsj0 hits before ./wsj1 hits, the same order the two
# separate invocations produced.
find ./wsj0 ./wsj1 -iname '*wv1' \
  | grep -E "_tr_|_TR_|_dt_|_ET_" >"${CMU_WSJ:?CMU_WSJ must be set}/wv1_files.txt"
# Create a config file for use with the coding script.
# CreateConfig.pl is given the template path followed by NAME VALUE pairs
# (CMU_ROOT <value>, BASE_DIR wsj); its output is captured as temp_code.cfg.
# NOTE(review): presumably it substitutes those names in the template -- confirm.
# The expansions are quoted so that an empty or space-containing value
# cannot shift the NAME/VALUE pairing of the arguments.
perl "$CMU_SCRIPTS/CreateConfig.pl" \
  "${CMU_WSJ:?CMU_WSJ must be set}/etc/code_template.cfg" \
  CMU_ROOT "$CMU_ROOT" BASE_DIR wsj >"$CMU_WSJ/temp_code.cfg"

# We don't really need to keep the uncompressed WSJ data
# around, we just want the feature vectors.  We'll do
# the data in chunks, once we code the data
# we can delete the wav files.
#
# Each pass: pick a subset of wv1_files.txt, convert it into $CMU_WSJ/wav,
# derive a control file from the same subset, run make_feats.pl on it from
# inside $CMU_WSJ, then wipe and recreate the wav directory.
I=0
CHUNKS=500
# NOTE(review): the -le bound runs the body for I = 0..CHUNKS, i.e.
# CHUNKS + 1 passes.  Whether the last pass selects any files depends on
# how OutputEvery.pl interprets its arguments -- confirm before changing.
while [ "$I" -le "$CHUNKS" ]; do
  echo "Converting and coding chunk ${I} of ${CHUNKS}..."

  # Write this pass's share of the master list to a temporary list.
  perl "$CMU_SCRIPTS/OutputEvery.pl" "$CMU_WSJ/wv1_files.txt" "$CHUNKS" "$I" \
    >"$CMU_WSJ/temp_wv1_files.txt"

  # Call the script that builds the directory structure and converts
  # each file.
  perl "$CMU_SCRIPTS/ConvertAndCopy.pl" "$CMU_WSJ/temp_wv1_files.txt" \
    "$CMU_WSJ/wav" "$CMU_WSJ/feat"

  # Build the control file for make_feats.pl from the same list.
  # NOTE(review): presumably StripText.pl removes the "./" prefix and the
  # .wv1/.WV1 extensions from every line -- confirm against the script.
  perl "$CMU_SCRIPTS/StripText.pl" "$CMU_WSJ/temp_wv1_files.txt" ./ .wv1 .WV1 \
    >"$CMU_WSJ/temp.fileids"

  # make_feats.pl is run from inside $CMU_WSJ; abort rather than run it
  # from the wrong directory.
  cd "$CMU_WSJ" || exit 1

  # Code up all the files
  perl "$CMU_WSJ/scripts_pl/make_feats.pl" -cfg "$CMU_WSJ/temp_code.cfg" \
    -ctl "$CMU_WSJ/temp.fileids"

  # Back to the corpus root so the next pass resolves ./wsj0 and ./wsj1 paths.
  cd "$WSJ_ROOT" || exit 1

  # Remove the following two lines if you want to keep the
  # uncompressed wave files around.
  # The :? guard aborts instead of running `rm -rf /wav` when CMU_WSJ is empty.
  rm -rf -- "${CMU_WSJ:?CMU_WSJ must be set}/wav"
  mkdir -- "$CMU_WSJ/wav"

  I=$((I + 1))

done

# Remove the temporary files created above.  Each file is removed by its own
# rm call (no -f), so a missing file is still reported and the final status
# is that of the last removal.  Paths are quoted so a CMU_WSJ value
# containing whitespace is handled as a single path.
for leftover in temp_wv1_files.txt temp_code.cfg temp.fileids; do
  rm -- "${CMU_WSJ:?CMU_WSJ must be set}/$leftover"
done