SphinxBase 0.6
src/sphinx_lmtools/sphinx_lm_eval.c
Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00041 #include <sphinxbase/logmath.h>
00042 #include <sphinxbase/ngram_model.h>
00043 #include <sphinxbase/cmd_ln.h>
00044 #include <sphinxbase/ckd_alloc.h>
00045 #include <sphinxbase/err.h>
00046 #include <sphinxbase/pio.h>
00047 #include <sphinxbase/strfuncs.h>
00048 
00049 #include <stdio.h>
00050 #include <string.h>
00051 #include <math.h>
00052 
00053 static const arg_t defn[] = {
00054   { "-help",
00055     ARG_BOOLEAN,
00056     "no",
00057     "Shows the usage of the tool"},
00058 
00059   { "-logbase",
00060     ARG_FLOAT64,
00061     "1.0001",
00062     "Base in which all log-likelihoods calculated" },
00063 
00064   { "-lm",
00065     ARG_STRING,
00066     NULL,
00067     "Language model file"},
00068 
00069   { "-probdef",
00070     ARG_STRING,
00071     NULL,
00072     "Probability definition file for classes in LM"},
00073 
00074   { "-lmctlfn",
00075     ARG_STRING,
00076     NULL,
00077     "Control file listing a set of language models"},
00078 
00079   { "-lmname",
00080     ARG_STRING,
00081     NULL,
00082     "Name of language model in -lmctlfn to use for all utterances" },
00083 
00084   { "-lsn",
00085     ARG_STRING,
00086     NULL,
00087     "Transcription file to evaluate"},
00088 
00089   { "-text",
00090     ARG_STRING,
00091     "Text string to evaluate"},
00092 
00093   { "-mmap",
00094     ARG_BOOLEAN,
00095     "no",
00096     "Use memory-mapped I/O for reading binary LM files"},
00097 
00098   { "-lw",
00099     ARG_FLOAT32,
00100     "1.0",
00101     "Language model weight" },
00102 
00103   { "-wip",
00104     ARG_FLOAT32,
00105     "1.0",
00106     "Word insertion probability" },
00107 
00108   { "-uw",
00109     ARG_FLOAT32,
00110     "1.0",
00111     "Unigram probability weight (interpolated with uniform distribution)"},
00112 
00113   { "-verbose",
00114     ARG_BOOLEAN,
00115     "no",
00116     "Print details of perplexity calculation" },
00117 
00118   /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
00119   { NULL, 0, NULL, NULL }
00120 };
00121 
/* Non-zero when -verbose was given (set in main()); makes calc_entropy()
 * print the log probability of every N-gram it scores. */
static int verbose;
00123 
00124 static int
00125 calc_entropy(ngram_model_t *lm, char **words, int32 n,
00126              int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
00127 {
00128         int32 *wids;
00129         int32 startwid;
00130         int32 i, ch, nccs, noovs, unk;
00131 
00132         if (n == 0)
00133             return 0;
00134 
00135         unk = ngram_unknown_wid(lm);
00136 
00137         /* Reverse this array into an array of word IDs. */
00138         wids = ckd_calloc(n, sizeof(*wids));
00139         for (i = 0; i < n; ++i)
00140                 wids[n-i-1] = ngram_wid(lm, words[i]);
00141         /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00142         startwid = ngram_wid(lm, "<s>");
00143 
00144         /* Now evaluate the list of words in reverse using the
00145          * remainder of the array as the history. */
00146         ch = noovs = nccs = 0;
00147         for (i = 0; i < n; ++i) {
00148                 int32 n_used;
00149                 int32 prob;
00150 
00151                 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00152                 if (wids[i] == startwid) {
00153                         ++nccs;
00154                         continue;
00155                 }
00156                 /* Skip and count OOVs. */
00157                 if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
00158                         ++noovs;
00159                         continue;
00160                 }
00161                 /* Sum up information for each N-gram */
00162                 prob = ngram_ng_score(lm,
00163                                       wids[i], wids + i + 1,
00164                                       n - i - 1, &n_used);
00165                 if (verbose) {
00166                     int m;
00167                     printf("log P(%s|", ngram_word(lm, wids[i]));
00168                     m = i + ngram_model_get_size(lm) - 1;
00169                     if (m >= n)
00170                         m = n - 1;
00171                     while (m > i) {
00172                         printf("%s ", ngram_word(lm, wids[m--]));
00173                     }
00174                     printf(") = %d\n", prob);
00175                 }
00176                 ch -= prob;
00177         }
00178 
00179         if (out_n_ccs) *out_n_ccs = nccs;
00180         if (out_n_oovs) *out_n_oovs = noovs;
00181 
00182         /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
00183         n -= (nccs + noovs);
00184         if (n <= 0)
00185             return 0;
00186         if (out_lm_score)
00187             *out_lm_score = -ch;
00188         return ch / n;
00189 }
00190 
00191 static void
00192 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
00193 {
00194         FILE *fh;
00195         lineiter_t *litor;
00196         int32 nccs, noovs, nwords, lscr;
00197         float64 ch, log_to_log2;;
00198 
00199         if ((fh = fopen(lsnfn, "r")) == NULL)
00200                 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
00201 
00202         /* We have to keep ch in floating-point to avoid overflows, so
00203          * we might as well use log2. */
00204         log_to_log2 = log(logmath_get_base(lmath)) / log(2);
00205         nccs = noovs = nwords = 0;
00206         ch = 0.0;
00207         for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) {
00208                 char **words;
00209                 int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;
00210 
00211                 n = str2words(litor->buf, NULL, 0);
00212                 if (n < 0)
00213                         E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
00214                 if (n == 0) /* Do nothing! */
00215                         continue;
00216                 words = ckd_calloc(n, sizeof(*words));
00217                 str2words(litor->buf, words, n);
00218 
00219                 /* Remove any utterance ID (FIXME: has to be a single "word") */
00220                 if (words[n-1][0] == '('
00221                     && words[n-1][strlen(words[n-1])-1] == ')')
00222                         n = n - 1;
00223 
00224                 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
00225                                       &tmp_noovs, &tmp_lscr);
00226 
00227                 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
00228                 nccs += tmp_nccs;
00229                 noovs += tmp_noovs;
00230                 lscr += tmp_lscr;
00231                 nwords += n;
00232                 
00233                 ckd_free(words);
00234         }
00235 
00236         ch /= (nwords - nccs - noovs);
00237         printf("cross-entropy: %f bits\n", ch);
00238 
00239         /* Calculate perplexity pplx = exp CH */
00240         printf("perplexity: %f\n", pow(2.0, ch));
00241         printf("lm score: %d\n", lscr);
00242 
00243         /* Report OOVs and CCs */
00244         printf("%d words evaluated\n", nwords);
00245         printf("%d OOVs (%.2f%%), %d context cues removed\n",
00246                noovs, (double)noovs / nwords * 100, nccs);
00247 }
00248 
00249 static void
00250 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
00251 {
00252         char *textfoo;
00253         char **words;
00254         int32 n, ch, noovs, nccs, lscr;
00255 
00256         /* Split it into an array of strings. */
00257         textfoo = ckd_salloc(text);
00258         n = str2words(textfoo, NULL, 0);
00259         if (n < 0)
00260                 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
00261         if (n == 0) /* Do nothing! */
00262                 return;
00263         words = ckd_calloc(n, sizeof(*words));
00264         str2words(textfoo, words, n);
00265 
00266         ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);
00267 
00268         printf("input: %s\n", text);
00269         printf("cross-entropy: %f bits\n",
00270                ch * log(logmath_get_base(lmath)) / log(2));
00271 
00272         /* Calculate perplexity pplx = exp CH */
00273         printf("perplexity: %f\n", logmath_exp(lmath, ch));
00274         printf("lm score: %d\n", lscr);
00275 
00276         /* Report OOVs and CCs */
00277         printf("%d words evaluated\n", n);
00278         printf("%d OOVs, %d context cues removed\n",
00279               noovs, nccs);
00280 
00281         ckd_free(textfoo);
00282         ckd_free(words);
00283 }
00284 
00285 int
00286 main(int argc, char *argv[])
00287 {
00288         cmd_ln_t *config;
00289         ngram_model_t *lm = NULL;
00290         logmath_t *lmath;
00291         const char *lmfn, *probdefn, *lsnfn, *text;
00292 
00293         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00294                 return 1;
00295 
00296         verbose = cmd_ln_boolean_r(config, "-verbose");
00297 
00298         /* Create log math object. */
00299         if ((lmath = logmath_init
00300              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00301                 E_FATAL("Failed to initialize log math\n");
00302         }
00303 
00304         /* Load the language model. */
00305         lmfn = cmd_ln_str_r(config, "-lm");
00306         if (lmfn == NULL
00307             || (lm = ngram_model_read(config, lmfn,
00308                                       NGRAM_AUTO, lmath)) == NULL) {
00309                 E_FATAL("Failed to load language model from %s\n",
00310                         cmd_ln_str_r(config, "-lm"));
00311         }
00312         if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
00313             ngram_model_read_classdef(lm, probdefn);
00314         ngram_model_apply_weights(lm,
00315                                   cmd_ln_float32_r(config, "-lw"),
00316                                   cmd_ln_float32_r(config, "-wip"),
00317                                   cmd_ln_float32_r(config, "-uw"));
00318 
00319         /* Now evaluate some text. */
00320         lsnfn = cmd_ln_str_r(config, "-lsn");
00321         text = cmd_ln_str_r(config, "-text");
00322         if (lsnfn) {
00323                 evaluate_file(lm, lmath, lsnfn);
00324         }
00325         else if (text) {
00326                 evaluate_string(lm, lmath, text);
00327         }
00328 
00329         return 0;
00330 }