src/sphinx_lmtools/lm_eval.c

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2008 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <strfuncs.h>
00047 
00048 #include <stdio.h>
00049 #include <string.h>
00050 #include <math.h>
00051 
00052 static const arg_t defn[] = {
00053   { "-help",
00054     ARG_BOOLEAN,
00055     "no",
00056     "Shows the usage of the tool"},
00057 
00058   { "-logbase",
00059     ARG_FLOAT64,
00060     "1.0001",
00061     "Base in which all log-likelihoods calculated" },
00062 
00063   { "-lm",
00064     ARG_STRING,
00065     NULL,
00066     "Language model file"},
00067 
00068   { "-probdef",
00069     ARG_STRING,
00070     NULL,
00071     "Probability definition file for classes in LM"},
00072 
00073   { "-lmctlfn",
00074     ARG_STRING,
00075     NULL,
00076     "Control file listing a set of language models"},
00077 
00078   { "-lmname",
00079     ARG_STRING,
00080     NULL,
00081     "Name of language model in -lmctlfn to use for all utterances" },
00082 
00083   { "-lsn",
00084     ARG_STRING,
00085     NULL,
00086     "Transcription file to evaluate"},
00087 
00088   { "-text",
00089     ARG_STRING,
00090     "Text string to evaluate"},
00091 
00092   { "-mmap",
00093     ARG_BOOLEAN,
00094     "no",
00095     "Use memory-mapped I/O for reading binary LM files"},
00096 
00097   { "-lw",
00098     ARG_FLOAT32,
00099     "1.0",
00100     "Language model weight" },
00101 
00102   { "-wip",
00103     ARG_FLOAT32,
00104     "1.0",
00105     "Word insertion probability" },
00106 
00107   { "-uw",
00108     ARG_FLOAT32,
00109     "1.0",
00110     "Unigram probability weight (interpolated with uniform distribution)"},
00111 
00112   /* FIXME: Support -lmstartsym, -lmendsym, -lmctlfn, -ctl_lm */
00113   { NULL, 0, NULL, NULL }
00114 };
00115 
00116 static int
00117 calc_entropy(ngram_model_t *lm, char **words, int32 n,
00118              int32 *out_n_ccs, int32 *out_n_oovs)
00119 {
00120         int32 *wids;
00121         int32 startwid;
00122         int32 i, ch, nccs, noovs, unk;
00123 
00124         if (n == 0)
00125             return 0;
00126 
00127         unk = ngram_unknown_wid(lm);
00128 
00129         /* Reverse this array into an array of word IDs. */
00130         wids = ckd_calloc(n, sizeof(*wids));
00131         for (i = 0; i < n; ++i)
00132                 wids[n-i-1] = ngram_wid(lm, words[i]);
00133         /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00134         startwid = ngram_wid(lm, "<s>");
00135 
00136         /* Now evaluate the list of words in reverse using the
00137          * remainder of the array as the history. */
00138         ch = noovs = nccs = 0;
00139         for (i = 0; i < n; ++i) {
00140                 int32 n_used;
00141                 int32 prob;
00142 
00143                 /* Skip <s> as it's a context cue (HACK, this should be configurable). */
00144                 if (wids[i] == startwid) {
00145                         ++nccs;
00146                         continue;
00147                 }
00148                 /* Skip and count OOVs. */
00149                 if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
00150                         ++noovs;
00151                         continue;
00152                 }
00153                 /* Sum up information for each N-gram */
00154                 prob = ngram_ng_score(lm,
00155                                       wids[i], wids + i + 1,
00156                                       n - i - 1, &n_used);
00157                 ch -= prob;
00158         }
00159 
00160         if (out_n_ccs) *out_n_ccs = nccs;
00161         if (out_n_oovs) *out_n_oovs = noovs;
00162 
00163         /* Calculate cross-entropy CH = - 1/N sum log P(W|H) */
00164         n -= (nccs + noovs);
00165         if (n <= 0)
00166             return 0;
00167         return ch / n;
00168 }
00169 
00170 static void
00171 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
00172 {
00173         FILE *fh;
00174         char line[256];
00175         int32 nccs, noovs, nwords;
00176         float64 ch, log_to_log2;;
00177 
00178         if ((fh = fopen(lsnfn, "r")) == NULL)
00179                 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
00180 
00181         /* We have to keep ch in floating-point to avoid overflows, so
00182          * we might as well use log2. */
00183         log_to_log2 = log(logmath_get_base(lmath)) / log(2);
00184         nccs = noovs = nwords = 0;
00185         ch = 0.0;
00186         while (fgets(line, sizeof(line), fh)) {
00187                 char **words;
00188                 int32 n, tmp_ch, tmp_noovs, tmp_nccs;
00189 
00190                 n = str2words(line, NULL, 0);
00191                 if (n < 0)
00192                         E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
00193                 if (n == 0) /* Do nothing! */
00194                         continue;
00195                 words = ckd_calloc(n, sizeof(*words));
00196                 str2words(line, words, n);
00197 
00198                 /* Remove any utterance ID (FIXME: has to be a single "word") */
00199                 if (words[n-1][0] == '('
00200                     && words[n-1][strlen(words[n-1])-1] == ')')
00201                         n = n - 1;
00202 
00203                 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);
00204 
00205                 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
00206                 nccs += tmp_nccs;
00207                 noovs += tmp_noovs;
00208                 nwords += n;
00209                 
00210                 ckd_free(words);
00211         }
00212 
00213         ch /= (nwords - nccs - noovs);
00214         printf("cross-entropy: %f bits\n", ch);
00215 
00216         /* Calculate perplexity pplx = exp CH */
00217         printf("perplexity: %f\n", pow(2.0, ch));
00218 
00219         /* Report OOVs and CCs */
00220         printf("%d words evaluated\n", nwords);
00221         printf("%d OOVs (%.2f%%), %d context cues removed\n",
00222                noovs, (double)noovs / nwords * 100, nccs);
00223 }
00224 
00225 static void
00226 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
00227 {
00228         char *textfoo;
00229         char **words;
00230         int32 n, ch, noovs, nccs;
00231 
00232         /* Split it into an array of strings. */
00233         textfoo = ckd_salloc(text);
00234         n = str2words(textfoo, NULL, 0);
00235         if (n < 0)
00236                 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
00237         if (n == 0) /* Do nothing! */
00238                 return;
00239         words = ckd_calloc(n, sizeof(*words));
00240         str2words(textfoo, words, n);
00241 
00242         ch = calc_entropy(lm, words, n, &nccs, &noovs);
00243 
00244         printf("input: %s\n", text);
00245         printf("cross-entropy: %f bits\n",
00246                ch * log(logmath_get_base(lmath)) / log(2));
00247 
00248         /* Calculate perplexity pplx = exp CH */
00249         printf("perplexity: %f\n", logmath_exp(lmath, ch));
00250 
00251         /* Report OOVs and CCs */
00252         printf("%d words evaluated\n", n);
00253         printf("%d OOVs, %d context cues removed\n",
00254               noovs, nccs);
00255 
00256         ckd_free(textfoo);
00257         ckd_free(words);
00258 }
00259 
00260 int
00261 main(int argc, char *argv[])
00262 {
00263         cmd_ln_t *config;
00264         ngram_model_t *lm = NULL;
00265         logmath_t *lmath;
00266         const char *lmfn, *probdefn, *lsnfn, *text;
00267 
00268         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00269                 return 1;
00270 
00271         /* Create log math object. */
00272         if ((lmath = logmath_init
00273              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00274                 E_FATAL("Failed to initialize log math\n");
00275         }
00276 
00277         /* Load the language model. */
00278         lmfn = cmd_ln_str_r(config, "-lm");
00279         if (lmfn == NULL
00280             || (lm = ngram_model_read(config, lmfn,
00281                                       NGRAM_AUTO, lmath)) == NULL) {
00282                 E_FATAL("Failed to load language model from %s\n",
00283                         cmd_ln_str_r(config, "-lm"));
00284         }
00285         if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
00286             ngram_model_read_classdef(lm, probdefn);
00287         ngram_model_apply_weights(lm,
00288                                   cmd_ln_float32_r(config, "-lw"),
00289                                   cmd_ln_float32_r(config, "-wip"),
00290                                   cmd_ln_float32_r(config, "-uw"));
00291 
00292 
00293         /* Now evaluate some text. */
00294         lsnfn = cmd_ln_str_r(config, "-lsn");
00295         text = cmd_ln_str_r(config, "-text");
00296         if (lsnfn) {
00297                 evaluate_file(lm, lmath, lsnfn);
00298         }
00299         else if (text) {
00300                 evaluate_string(lm, lmath, text);
00301         }
00302 
00303         return 0;
00304 }

Generated on Mon Jan 24 21:36:19 2011 for SphinxBase by  doxygen 1.4.7