00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00041 #include <logmath.h>
00042 #include <ngram_model.h>
00043 #include <cmd_ln.h>
00044 #include <ckd_alloc.h>
00045 #include <err.h>
00046 #include <strfuncs.h>
00047
00048 #include <stdio.h>
00049 #include <string.h>
00050 #include <math.h>
00051
00052 static const arg_t defn[] = {
00053 { "-help",
00054 ARG_BOOLEAN,
00055 "no",
00056 "Shows the usage of the tool"},
00057
00058 { "-logbase",
00059 ARG_FLOAT64,
00060 "1.0001",
00061 "Base in which all log-likelihoods calculated" },
00062
00063 { "-lm",
00064 ARG_STRING,
00065 NULL,
00066 "Language model file"},
00067
00068 { "-probdef",
00069 ARG_STRING,
00070 NULL,
00071 "Probability definition file for classes in LM"},
00072
00073 { "-lmctlfn",
00074 ARG_STRING,
00075 NULL,
00076 "Control file listing a set of language models"},
00077
00078 { "-lmname",
00079 ARG_STRING,
00080 NULL,
00081 "Name of language model in -lmctlfn to use for all utterances" },
00082
00083 { "-lsn",
00084 ARG_STRING,
00085 NULL,
00086 "Transcription file to evaluate"},
00087
00088 { "-text",
00089 ARG_STRING,
00090 "Text string to evaluate"},
00091
00092 { "-mmap",
00093 ARG_BOOLEAN,
00094 "no",
00095 "Use memory-mapped I/O for reading binary LM files"},
00096
00097 { "-lw",
00098 ARG_FLOAT32,
00099 "1.0",
00100 "Language model weight" },
00101
00102 { "-wip",
00103 ARG_FLOAT32,
00104 "1.0",
00105 "Word insertion probability" },
00106
00107 { "-uw",
00108 ARG_FLOAT32,
00109 "1.0",
00110 "Unigram probability weight (interpolated with uniform distribution)"},
00111
00112
00113 { NULL, 0, NULL, NULL }
00114 };
00115
00116 static int
00117 calc_entropy(ngram_model_t *lm, char **words, int32 n,
00118 int32 *out_n_ccs, int32 *out_n_oovs)
00119 {
00120 int32 *wids;
00121 int32 startwid;
00122 int32 i, ch, nccs, noovs, unk;
00123
00124 if (n == 0)
00125 return 0;
00126
00127 unk = ngram_unknown_wid(lm);
00128
00129
00130 wids = ckd_calloc(n, sizeof(*wids));
00131 for (i = 0; i < n; ++i)
00132 wids[n-i-1] = ngram_wid(lm, words[i]);
00133
00134 startwid = ngram_wid(lm, "<s>");
00135
00136
00137
00138 ch = noovs = nccs = 0;
00139 for (i = 0; i < n; ++i) {
00140 int32 n_used;
00141 int32 prob;
00142
00143
00144 if (wids[i] == startwid) {
00145 ++nccs;
00146 continue;
00147 }
00148
00149 if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) {
00150 ++noovs;
00151 continue;
00152 }
00153
00154 prob = ngram_ng_score(lm,
00155 wids[i], wids + i + 1,
00156 n - i - 1, &n_used);
00157 ch -= prob;
00158 }
00159
00160 if (out_n_ccs) *out_n_ccs = nccs;
00161 if (out_n_oovs) *out_n_oovs = noovs;
00162
00163
00164 n -= (nccs + noovs);
00165 if (n <= 0)
00166 return 0;
00167 return ch / n;
00168 }
00169
00170 static void
00171 evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn)
00172 {
00173 FILE *fh;
00174 char line[256];
00175 int32 nccs, noovs, nwords;
00176 float64 ch, log_to_log2;;
00177
00178 if ((fh = fopen(lsnfn, "r")) == NULL)
00179 E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn);
00180
00181
00182
00183 log_to_log2 = log(logmath_get_base(lmath)) / log(2);
00184 nccs = noovs = nwords = 0;
00185 ch = 0.0;
00186 while (fgets(line, sizeof(line), fh)) {
00187 char **words;
00188 int32 n, tmp_ch, tmp_noovs, tmp_nccs;
00189
00190 n = str2words(line, NULL, 0);
00191 if (n < 0)
00192 E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n);
00193 if (n == 0)
00194 continue;
00195 words = ckd_calloc(n, sizeof(*words));
00196 str2words(line, words, n);
00197
00198
00199 if (words[n-1][0] == '('
00200 && words[n-1][strlen(words[n-1])-1] == ')')
00201 n = n - 1;
00202
00203 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, &tmp_noovs);
00204
00205 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
00206 nccs += tmp_nccs;
00207 noovs += tmp_noovs;
00208 nwords += n;
00209
00210 ckd_free(words);
00211 }
00212
00213 ch /= (nwords - nccs - noovs);
00214 printf("cross-entropy: %f bits\n", ch);
00215
00216
00217 printf("perplexity: %f\n", pow(2.0, ch));
00218
00219
00220 printf("%d words evaluated\n", nwords);
00221 printf("%d OOVs (%.2f%%), %d context cues removed\n",
00222 noovs, (double)noovs / nwords * 100, nccs);
00223 }
00224
00225 static void
00226 evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text)
00227 {
00228 char *textfoo;
00229 char **words;
00230 int32 n, ch, noovs, nccs;
00231
00232
00233 textfoo = ckd_salloc(text);
00234 n = str2words(textfoo, NULL, 0);
00235 if (n < 0)
00236 E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
00237 if (n == 0)
00238 return;
00239 words = ckd_calloc(n, sizeof(*words));
00240 str2words(textfoo, words, n);
00241
00242 ch = calc_entropy(lm, words, n, &nccs, &noovs);
00243
00244 printf("input: %s\n", text);
00245 printf("cross-entropy: %f bits\n",
00246 ch * log(logmath_get_base(lmath)) / log(2));
00247
00248
00249 printf("perplexity: %f\n", logmath_exp(lmath, ch));
00250
00251
00252 printf("%d words evaluated\n", n);
00253 printf("%d OOVs, %d context cues removed\n",
00254 noovs, nccs);
00255
00256 ckd_free(textfoo);
00257 ckd_free(words);
00258 }
00259
00260 int
00261 main(int argc, char *argv[])
00262 {
00263 cmd_ln_t *config;
00264 ngram_model_t *lm = NULL;
00265 logmath_t *lmath;
00266 const char *lmfn, *probdefn, *lsnfn, *text;
00267
00268 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00269 return 1;
00270
00271
00272 if ((lmath = logmath_init
00273 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00274 E_FATAL("Failed to initialize log math\n");
00275 }
00276
00277
00278 lmfn = cmd_ln_str_r(config, "-lm");
00279 if (lmfn == NULL
00280 || (lm = ngram_model_read(config, lmfn,
00281 NGRAM_AUTO, lmath)) == NULL) {
00282 E_FATAL("Failed to load language model from %s\n",
00283 cmd_ln_str_r(config, "-lm"));
00284 }
00285 if ((probdefn = cmd_ln_str_r(config, "-probdef")) != NULL)
00286 ngram_model_read_classdef(lm, probdefn);
00287 ngram_model_apply_weights(lm,
00288 cmd_ln_float32_r(config, "-lw"),
00289 cmd_ln_float32_r(config, "-wip"),
00290 cmd_ln_float32_r(config, "-uw"));
00291
00292
00293
00294 lsnfn = cmd_ln_str_r(config, "-lsn");
00295 text = cmd_ln_str_r(config, "-text");
00296 if (lsnfn) {
00297 evaluate_file(lm, lmath, lsnfn);
00298 }
00299 else if (text) {
00300 evaluate_string(lm, lmath, text);
00301 }
00302
00303 return 0;
00304 }