src/libsphinxbase/lm/ngram_model.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model.c N-Gram language models.
00039  *
00040  * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
00041  */
00042 
00043 #include "config.h"
00044 #include "ngram_model.h"
00045 #include "ngram_model_internal.h"
00046 #include "ckd_alloc.h"
00047 #include "filename.h"
00048 #include "pio.h"
00049 #include "err.h"
00050 #include "logmath.h"
00051 #include "strfuncs.h"
00052 #include "case.h"
00053 
00054 #include <string.h>
00055 #include <assert.h>
00056 #ifdef HAVE_ICONV
00057 #include <iconv.h>
00058 #endif 
00059 
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063     const char *ext;
00064 
00065     ext = strrchr(file_name, '.');
00066     if (ext == NULL) {
00067         return NGRAM_ARPA; /* Default file type */
00068     }
00069     if (0 == strcmp_nocase(ext, ".gz")) {
00070         while (--ext >= file_name) {
00071             if (*ext == '.') break;
00072         }
00073         if (ext < file_name) {
00074             return NGRAM_ARPA; /* Default file type */
00075         }
00076     }
00077     else if (0 == strcmp_nocase(ext, ".bz2")) {
00078         while (--ext >= file_name) {
00079             if (*ext == '.') break;
00080         }
00081         if (ext < file_name) {
00082             return NGRAM_ARPA; /* Default file type */
00083         }
00084     }
00085     /* We use strncmp because there might be a .gz on the end. */
00086     if (0 == strncmp_nocase(ext, ".ARPA", 5))
00087         return NGRAM_ARPA;
00088     if (0 == strncmp_nocase(ext, ".DMP32", 6))
00089         return NGRAM_DMP32;
00090     if (0 == strncmp_nocase(ext, ".DMP", 4))
00091         return NGRAM_DMP;
00092     return NGRAM_ARPA; /* Default file type */
00093 }
00094 
00095 ngram_model_t *
00096 ngram_model_read(cmd_ln_t *config,
00097                  const char *file_name,
00098                  ngram_file_type_t file_type,
00099                  logmath_t *lmath)
00100 {
00101     ngram_model_t *model = NULL;
00102 
00103     switch (file_type) {
00104     case NGRAM_AUTO: {
00105         if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00106             break;
00107         if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00108             break;
00109         if ((model = ngram_model_dmp32_read(config, file_name, lmath)) != NULL)
00110             break;
00111         return NULL;
00112     }
00113     case NGRAM_ARPA:
00114         model = ngram_model_arpa_read(config, file_name, lmath);
00115         break;
00116     case NGRAM_DMP:
00117         model = ngram_model_dmp_read(config, file_name, lmath);
00118         break;
00119     case NGRAM_DMP32:
00120         model = ngram_model_dmp32_read(config, file_name, lmath);
00121         break;
00122     }
00123 
00124     /* Now set weights based on config if present. */
00125     if (config) {
00126         float32 lw = 1.0;
00127         float32 wip = 1.0;
00128         float32 uw = 1.0;
00129 
00130         if (cmd_ln_exists_r(config, "-lw"))
00131             lw = cmd_ln_float32_r(config, "-lw");
00132         if (cmd_ln_exists_r(config, "-wip"))
00133             wip = cmd_ln_float32_r(config, "-wip");
00134         if (cmd_ln_exists_r(config, "-uw"))
00135             uw = cmd_ln_float32_r(config, "-uw");
00136 
00137         ngram_model_apply_weights(model, lw, wip, uw);
00138     }
00139 
00140     return model;
00141 }
00142 
00143 int
00144 ngram_model_write(ngram_model_t *model, const char *file_name,
00145                   ngram_file_type_t file_type)
00146 {
00147     switch (file_type) {
00148     case NGRAM_AUTO: {
00149         file_type = ngram_file_name_to_type(file_name);
00150         return ngram_model_write(model, file_name, file_type);
00151     }
00152     case NGRAM_ARPA:
00153         return ngram_model_arpa_write(model, file_name);
00154     case NGRAM_DMP:
00155         return ngram_model_dmp_write(model, file_name);
00156     case NGRAM_DMP32:
00157         return ngram_model_dmp32_write(model, file_name);
00158     }
00159 
00160     return -1; /* In case your compiler is really stupid. */
00161 }
00162 
00163 int32
00164 ngram_model_init(ngram_model_t *base,
00165                  ngram_funcs_t *funcs,
00166                  logmath_t *lmath,
00167                  int32 n, int32 n_unigram)
00168 {
00169     base->refcount = 1;
00170     base->funcs = funcs;
00171     base->n = n;
00172     /* If this was previously initialized... */
00173     if (base->n_counts == NULL)
00174         base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00175     /* Don't reset weights if logmath object hasn't changed. */
00176     if (base->lmath != lmath) {
00177         /* Set default values for weights. */
00178         base->lw = 1.0;
00179         base->log_wip = 0; /* i.e. 1.0 */
00180         base->log_uw = 0;  /* i.e. 1.0 */
00181         base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00182         base->log_uniform_weight = logmath_get_zero(lmath);
00183         base->log_zero = logmath_get_zero(lmath);
00184         base->lmath = lmath;
00185     }
00186     /* Allocate or reallocate space for word strings. */
00187     if (base->word_str) {
00188         /* Free all previous word strings if they were allocated. */
00189         if (base->writable) {
00190             int32 i;
00191             for (i = 0; i < base->n_words; ++i) {
00192                 ckd_free(base->word_str[i]);
00193                 base->word_str[i] = NULL;
00194             }
00195         }
00196         base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00197     }
00198     else
00199         base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00200     /* NOTE: They are no longer case-insensitive since we are allowing
00201      * other encodings for word strings.  Beware. */
00202     if (base->wid)
00203         hash_table_empty(base->wid);
00204     else
00205         base->wid = hash_table_new(n_unigram, FALSE);
00206     base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
00207 
00208     return 0;
00209 }
00210 
00211 ngram_model_t *
00212 ngram_model_retain(ngram_model_t *model)
00213 {
00214     ++model->refcount;
00215     return model;
00216 }
00217 
00218 
00219 void
00220 ngram_model_flush(ngram_model_t *model)
00221 {
00222     if (model->funcs && model->funcs->flush)
00223         (*model->funcs->flush)(model);
00224 }
00225 
00226 int
00227 ngram_model_free(ngram_model_t *model)
00228 {
00229     int i;
00230 
00231     if (model == NULL)
00232         return 0;
00233     if (--model->refcount > 0)
00234         return model->refcount;
00235     if (model->funcs && model->funcs->free)
00236         (*model->funcs->free)(model);
00237     if (model->writable) {
00238         /* Free all words. */
00239         for (i = 0; i < model->n_words; ++i) {
00240             ckd_free(model->word_str[i]);
00241         }
00242     }
00243     else {
00244         /* Free all class words. */
00245         for (i = 0; i < model->n_classes; ++i) {
00246             ngram_class_t *lmclass;
00247             int32 j;
00248 
00249             lmclass = model->classes[i];
00250             for (j = 0; j < lmclass->n_words; ++j) {
00251                 ckd_free(model->word_str[lmclass->start_wid + j]);
00252             }
00253             for (j = 0; j < lmclass->n_hash; ++j) {
00254                 if (lmclass->nword_hash[j].wid != -1) {
00255                     ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
00256                 }
00257             }
00258         }
00259     }
00260     for (i = 0; i < model->n_classes; ++i) {
00261         ngram_class_free(model->classes[i]);
00262     }
00263     ckd_free(model->classes);
00264     hash_table_free(model->wid);
00265     ckd_free(model->word_str);
00266     ckd_free(model->n_counts);
00267     ckd_free(model);
00268     return 0;
00269 }
00270 
00271 
00272 #ifdef HAVE_ICONV
00273 int
00274 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00275 {
00276     iconv_t ic;
00277     char *outbuf;
00278     size_t maxlen;
00279     int i, writable;
00280     hash_table_t *new_wid;
00281 
00282     /* FIXME: Need to do a special case thing for the GB-HEX encoding
00283      * used in Sphinx3 Mandarin models. */
00284     if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00285         E_ERROR_SYSTEM("iconv_open() failed");
00286         return -1;
00287     }
00288     /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
00289      * unlike wcstombs(3). So we have to either call it over and over
00290      * again until our buffer is big enough, or call it with a huge
00291      * buffer and then copy things back to the output.  We will use a
00292      * mix of these two approaches here.  We'll keep a single big
00293      * buffer around, and expand it as necessary.
00294      */
00295     maxlen = 0;
00296     for (i = 0; i < model->n_words; ++i) {
00297         if (strlen(model->word_str[i]) > maxlen)
00298             maxlen = strlen(model->word_str[i]);
00299     }
00300     /* Were word strings already allocated? */
00301     writable = model->writable;
00302     /* Either way, we are going to allocate some word strings. */
00303     model->writable = TRUE;
00304     /* Really should be big enough except for pathological cases. */
00305     maxlen = maxlen * sizeof(int) + 15;
00306     outbuf = ckd_calloc(maxlen, 1);
00307     /* And, don't forget, we need to rebuild the word to unigram ID
00308      * mapping. */
00309     new_wid = hash_table_new(model->n_words, FALSE);
00310     for (i = 0; i < model->n_words; ++i) {
00311         ICONV_CONST char *in;
00312         char *out;
00313         size_t inleft, outleft, result;
00314 
00315     start_conversion:
00316         in = (ICONV_CONST char *)model->word_str[i];
00317         /* Yes, this assumes that we don't have any NUL bytes. */
00318         inleft = strlen(in);
00319         out = outbuf;
00320         outleft = maxlen;
00321 
00322         while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00323             if (errno != E2BIG) {
00324                 /* FIXME: if we already converted any words, then they
00325                  * are going to be in an inconsistent state. */
00326                 E_ERROR_SYSTEM("iconv() failed");
00327                 ckd_free(outbuf);
00328                 hash_table_free(new_wid);
00329                 return -1;
00330             }
00331             /* Reset the internal state of conversion. */
00332             iconv(ic, NULL, NULL, NULL, NULL);
00333             /* Make everything bigger. */
00334             maxlen *= 2;
00335             out = outbuf = ckd_realloc(outbuf, maxlen);
00336             /* Reset the input pointers. */
00337             in = (ICONV_CONST char *)model->word_str[i];
00338             inleft = strlen(in);
00339         }
00340 
00341         /* Now flush a shift-out sequence, if any. */
00342         if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00343             if (errno != E2BIG) {
00344                 /* FIXME: if we already converted any words, then they
00345                  * are going to be in an inconsistent state. */
00346                 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00347                 ckd_free(outbuf);
00348                 hash_table_free(new_wid);
00349                 return -1;
00350             }
00351             /* Reset the internal state of conversion. */
00352             iconv(ic, NULL, NULL, NULL, NULL);
00353             /* Make everything bigger. */
00354             maxlen *= 2;
00355             outbuf = ckd_realloc(outbuf, maxlen);
00356             /* Be very evil. */
00357             goto start_conversion;
00358         }
00359 
00360         result = maxlen - outleft;
00361         /* Okay, that was hard, now let's go shopping. */
00362         if (writable) {
00363             /* Grow or shrink the output string as necessary. */
00364             model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00365             model->word_str[i][result] = '\0';
00366         }
00367         else {
00368             /* It actually was not allocated previously, so do that now. */
00369             model->word_str[i] = ckd_calloc(result + 1, 1);
00370         }
00371         /* Copy the new thing in. */
00372         memcpy(model->word_str[i], outbuf, result);
00373 
00374         /* Now update the hash table.  We might have terrible
00375          * collisions if a non-reversible conversion was requested.,
00376          * so warn about them. */
00377         if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00378             E_WARN("Duplicate word in dictionary after conversion: %s\n",
00379                    model->word_str[i]);
00380         }
00381     }
00382     ckd_free(outbuf);
00383     iconv_close(ic);
00384     /* Swap out the hash table. */
00385     hash_table_free(model->wid);
00386     model->wid = new_wid;
00387 
00388     return 0;
00389 }
00390 #else /* !HAVE_ICONV */
00391 int
00392 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00393 {
00394     return -1;
00395 }
00396 #endif /* !HAVE_ICONV */
00397 
00398 int
00399 ngram_model_apply_weights(ngram_model_t *model,
00400                           float32 lw, float32 wip, float32 uw)
00401 {
00402     return (*model->funcs->apply_weights)(model, lw, wip, uw);
00403 }
00404 
00405 float32
00406 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
00407                         int32 *out_log_uw)
00408 {
00409     if (out_log_wip) *out_log_wip = model->log_wip;
00410     if (out_log_uw) *out_log_uw = model->log_uw;
00411     return model->lw;
00412 }
00413 
00414 
00415 int32
00416 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
00417                int32 n_hist, int32 *n_used)
00418 {
00419     int32 score, class_weight = 0;
00420     int i;
00421 
00422     /* Closed vocabulary, OOV word probability is zero */
00423     if (wid == NGRAM_INVALID_WID)
00424         return model->log_zero;
00425 
00426     /* "Declassify" wid and history */
00427     if (NGRAM_IS_CLASSWID(wid)) {
00428         ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00429 
00430         class_weight = ngram_class_prob(lmclass, wid);
00431         if (class_weight == 1) /* Meaning, not found in class. */
00432             return model->log_zero;
00433         wid = lmclass->tag_wid;
00434     }
00435     for (i = 0; i < n_hist; ++i) {
00436         if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00437             history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00438     }
00439     score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
00440 
00441     /* Multiply by unigram in-class weight. */
00442     return score + class_weight;
00443 }
00444 
00445 int32
00446 ngram_score(ngram_model_t *model, const char *word, ...)
00447 {
00448     va_list history;
00449     const char *hword;
00450     int32 *histid;
00451     int32 n_hist;
00452     int32 n_used;
00453     int32 prob;
00454 
00455     va_start(history, word);
00456     n_hist = 0;
00457     while ((hword = va_arg(history, const char *)) != NULL)
00458         ++n_hist;
00459     va_end(history);
00460 
00461     histid = ckd_calloc(n_hist, sizeof(*histid));
00462     va_start(history, word);
00463     n_hist = 0;
00464     while ((hword = va_arg(history, const char *)) != NULL) {
00465         histid[n_hist] = ngram_wid(model, hword);
00466         ++n_hist;
00467     }
00468     va_end(history);
00469 
00470     prob = ngram_ng_score(model, ngram_wid(model, word),
00471                           histid, n_hist, &n_used);
00472     ckd_free(histid);
00473     return prob;
00474 }
00475 
00476 int32
00477 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00478 {
00479     int32 hist[2] = { w2, w1 };
00480     return ngram_ng_score(model, w3, hist, 2, n_used);
00481 }
00482 
00483 int32
00484 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
00485 {
00486     return ngram_ng_score(model, w2, &w1, 1, n_used);
00487 }
00488 
00489 int32
00490 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00491               int32 n_hist, int32 *n_used)
00492 {
00493     int32 prob, class_weight = 0;
00494     int i;
00495 
00496     /* Closed vocabulary, OOV word probability is zero */
00497     if (wid == NGRAM_INVALID_WID)
00498         return model->log_zero;
00499 
00500     /* "Declassify" wid and history */
00501     if (NGRAM_IS_CLASSWID(wid)) {
00502         ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00503 
00504         class_weight = ngram_class_prob(lmclass, wid);
00505         if (class_weight == 1) /* Meaning, not found in class. */
00506             return class_weight;
00507         wid = lmclass->tag_wid;
00508     }
00509     for (i = 0; i < n_hist; ++i) {
00510         if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00511             history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00512     }
00513     prob = (*model->funcs->raw_score)(model, wid, history,
00514                                       n_hist, n_used);
00515     /* Multiply by unigram in-class weight. */
00516     return prob + class_weight;
00517 }
00518 
00519 int32
00520 ngram_prob(ngram_model_t *model, const char *word, ...)
00521 {
00522     va_list history;
00523     const char *hword;
00524     int32 *histid;
00525     int32 n_hist;
00526     int32 n_used;
00527     int32 prob;
00528 
00529     va_start(history, word);
00530     n_hist = 0;
00531     while ((hword = va_arg(history, const char *)) != NULL)
00532         ++n_hist;
00533     va_end(history);
00534 
00535     histid = ckd_calloc(n_hist, sizeof(*histid));
00536     va_start(history, word);
00537     n_hist = 0;
00538     while ((hword = va_arg(history, const char *)) != NULL) {
00539         histid[n_hist] = ngram_wid(model, hword);
00540         ++n_hist;
00541     }
00542     va_end(history);
00543 
00544     prob = ngram_ng_prob(model, ngram_wid(model, word),
00545                          histid, n_hist, &n_used);
00546     ckd_free(histid);
00547     return prob;
00548 }
00549 
00550 int32
00551 ngram_score_to_prob(ngram_model_t *base, int32 score)
00552 {
00553     int32 prob;
00554 
00555     /* Undo insertion penalty. */
00556     prob = score - base->log_wip;
00557     /* Undo language weight. */
00558     prob = (int32)(prob / base->lw);
00559 
00560     return prob;
00561 }
00562 
00563 int32
00564 ngram_unknown_wid(ngram_model_t *model)
00565 {
00566     int32 val;
00567 
00568     /* FIXME: This could be memoized for speed if necessary. */
00569     /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
00570     if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00571         return NGRAM_INVALID_WID;
00572     else
00573         return val;
00574 }
00575 
00576 int32
00577 ngram_zero(ngram_model_t *model)
00578 {
00579     return model->log_zero;
00580 }
00581 
00582 int32
00583 ngram_model_get_size(ngram_model_t *model)
00584 {
00585   if (model != NULL)
00586     return model->n;
00587   return 0;
00588 }
00589 
00590 int32 const *
00591 ngram_model_get_counts(ngram_model_t *model)
00592 {
00593   if (model != NULL)
00594     return model->n_counts;
00595   return NULL;
00596 }
00597 
00598 void
00599 ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
00600                 int m, int successor)
00601 {
00602     itor->model = model;
00603     itor->wids = ckd_calloc(m, sizeof(*itor->wids));
00604     itor->m = m;
00605     itor->successor = successor;
00606 }
00607 
00608 ngram_iter_t *
00609 ngram_model_mgrams(ngram_model_t *model, int m)
00610 {
00611     ngram_iter_t *itor;
00612     /* The fact that m=n-1 is not exactly obvious.  Prevent accidents. */
00613     if (m >= model->n)
00614         return NULL;
00615     if (model->funcs->mgrams == NULL)
00616         return NULL;
00617     itor = (*model->funcs->mgrams)(model, m);
00618     return itor;
00619 }
00620 
00621 ngram_iter_t *
00622 ngram_iter(ngram_model_t *model, const char *word, ...)
00623 {
00624     va_list history;
00625     const char *hword;
00626     int32 *histid;
00627     int32 n_hist;
00628     ngram_iter_t *itor;
00629 
00630     va_start(history, word);
00631     n_hist = 0;
00632     while ((hword = va_arg(history, const char *)) != NULL)
00633         ++n_hist;
00634     va_end(history);
00635 
00636     histid = ckd_calloc(n_hist, sizeof(*histid));
00637     va_start(history, word);
00638     n_hist = 0;
00639     while ((hword = va_arg(history, const char *)) != NULL) {
00640         histid[n_hist] = ngram_wid(model, hword);
00641         ++n_hist;
00642     }
00643     va_end(history);
00644 
00645     itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
00646     ckd_free(histid);
00647     return itor;
00648 }
00649 
00650 ngram_iter_t *
00651 ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
00652 {
00653     if (n_hist >= model->n)
00654         return NULL;
00655     if (model->funcs->iter == NULL)
00656         return NULL;
00657     return (*model->funcs->iter)(model, wid, history, n_hist);
00658 }
00659 
00660 ngram_iter_t *
00661 ngram_iter_successors(ngram_iter_t *itor)
00662 {
00663     /* Stop when we are at the highest order N-Gram. */
00664     if (itor->m == itor->model->n - 1)
00665         return NULL;
00666     return (*itor->model->funcs->successors)(itor);
00667 }
00668 
00669 int32 const *
00670 ngram_iter_get(ngram_iter_t *itor,
00671                int32 *out_score,
00672                int32 *out_bowt)
00673 {
00674     return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
00675 }
00676 
00677 ngram_iter_t *
00678 ngram_iter_next(ngram_iter_t *itor)
00679 {
00680     return (*itor->model->funcs->iter_next)(itor);
00681 }
00682 
00683 void
00684 ngram_iter_free(ngram_iter_t *itor)
00685 {
00686     ckd_free(itor->wids);
00687     (*itor->model->funcs->iter_free)(itor);
00688 }
00689 
00690 int32
00691 ngram_wid(ngram_model_t *model, const char *word)
00692 {
00693     int32 val;
00694 
00695     if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00696         return ngram_unknown_wid(model);
00697     else
00698         return val;
00699 }
00700 
00701 const char *
00702 ngram_word(ngram_model_t *model, int32 wid)
00703 {
00704     /* Remove any class tag */
00705     wid = NGRAM_BASEWID(wid);
00706     if (wid >= model->n_words)
00707         return NULL;
00708     return model->word_str[wid];
00709 }
00710 
00714 int32
00715 ngram_add_word_internal(ngram_model_t *model,
00716                         const char *word,
00717                         int32 classid)
00718 {
00719     void *dummy;
00720     int32 wid;
00721 
00722     /* Take the next available word ID */
00723     wid = model->n_words;
00724     if (classid >= 0) {
00725         wid = NGRAM_CLASSWID(wid, classid);
00726     }
00727     /* Check for hash collisions. */
00728     if (hash_table_lookup(model->wid, word, &dummy) == 0) {
00729         E_ERROR("Duplicate definition of word %s\n", word);
00730         return NGRAM_INVALID_WID;
00731     }
00732     /* Reallocate word_str if necessary. */
00733     if (model->n_words >= model->n_1g_alloc) {
00734         model->n_1g_alloc += UG_ALLOC_STEP;
00735         model->word_str = ckd_realloc(model->word_str,
00736                                       sizeof(*model->word_str) * model->n_1g_alloc);
00737     }
00738     /* Add the word string in the appropriate manner. */
00739     /* Class words are always dynamically allocated. */
00740     model->word_str[model->n_words] = ckd_salloc(word);
00741     /* Now enter it into the hash table. */
00742     if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
00743         E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
00744                 model->word_str[model->n_words], (void *)(long)(wid));
00745     }
00746     /* Increment number of words. */
00747     ++model->n_words;
00748     return wid;
00749 }
00750 
00751 int32
00752 ngram_model_add_word(ngram_model_t *model,
00753                      const char *word, float32 weight)
00754 {
00755     int32 wid, prob = model->log_zero;
00756 
00757     wid = ngram_add_word_internal(model, word, -1);
00758     if (wid == NGRAM_INVALID_WID)
00759         return wid;
00760 
00761     /* Do what needs to be done to add the word to the unigram. */
00762     if (model->funcs && model->funcs->add_ug)
00763         prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
00764     if (prob == 0) {
00765         if (model->writable)
00766             ckd_free(model->word_str[wid]);
00767         return -1;
00768     }
00769     return wid;
00770 }
00771 
00772 ngram_class_t *
00773 ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
00774 {
00775     ngram_class_t *lmclass;
00776     gnode_t *gn;
00777     float32 tprob;
00778     int i;
00779 
00780     lmclass = ckd_calloc(1, sizeof(*lmclass));
00781     lmclass->tag_wid = tag_wid;
00782     /* wid_base is the wid (minus class tag) of the first word in the list. */
00783     lmclass->start_wid = start_wid;
00784     lmclass->n_words = glist_count(classwords);
00785     lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
00786     lmclass->nword_hash = NULL;
00787     lmclass->n_hash = 0;
00788     tprob = 0.0;
00789     for (gn = classwords; gn; gn = gnode_next(gn)) {
00790         tprob += gnode_float32(gn);
00791     }
00792     if (tprob > 1.1 || tprob < 0.9) {
00793         E_WARN("Total class probability is %f, will normalize\n", tprob);
00794         for (gn = classwords; gn; gn = gnode_next(gn)) {
00795             gn->data.fl /= tprob;
00796         }
00797     }
00798     for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
00799         lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
00800     }
00801 
00802     return lmclass;
00803 }
00804 
00805 int32
00806 ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
00807 {
00808     int32 hash;
00809 
00810     if (lmclass->nword_hash == NULL) {
00811         /* Initialize everything in it to -1 */
00812         lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00813         memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00814         lmclass->n_hash = NGRAM_HASH_SIZE;
00815         lmclass->n_hash_inuse = 0;
00816     }
00817     /* Stupidest possible hash function.  This will work pretty well
00818      * when this function is called repeatedly with contiguous word
00819      * IDs, though... */
00820     hash = wid & (lmclass->n_hash - 1);
00821     if (lmclass->nword_hash[hash].wid == -1) {
00822         /* Good, no collision. */
00823         lmclass->nword_hash[hash].wid = wid;
00824         lmclass->nword_hash[hash].prob1 = lweight;
00825         ++lmclass->n_hash_inuse;
00826         return hash;
00827     }
00828     else {
00829         int32 next; 
00830         /* Collision... Find the end of the hash chain. */
00831         while (lmclass->nword_hash[hash].next != -1)
00832             hash = lmclass->nword_hash[hash].next;
00833         assert(hash != -1);
00834         /* Does we has any more bukkit? */
00835         if (lmclass->n_hash_inuse == lmclass->n_hash) {
00836             /* Oh noes!  Ok, we makes more. */
00837             lmclass->nword_hash = ckd_realloc(lmclass->nword_hash, 
00838                                               lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
00839             memset(lmclass->nword_hash + lmclass->n_hash,
00840                    0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
00841             /* Just use the next allocated one (easy) */
00842             next = lmclass->n_hash;
00843             lmclass->n_hash *= 2;
00844         }
00845         else {
00846             /* Look for any available bucket.  We hope this doesn't happen. */
00847             for (next = 0; next < lmclass->n_hash; ++next)
00848                 if (lmclass->nword_hash[next].wid == -1)
00849                     break;
00850             /* This should absolutely not happen. */
00851             assert(next != lmclass->n_hash);
00852         }
00853         lmclass->nword_hash[next].wid = wid;
00854         lmclass->nword_hash[next].prob1 = lweight;
00855         lmclass->nword_hash[hash].next = next;
00856         ++lmclass->n_hash_inuse;
00857         return next;
00858     }
00859 }
00860 
00861 void
00862 ngram_class_free(ngram_class_t *lmclass)
00863 {
00864     ckd_free(lmclass->nword_hash);
00865     ckd_free(lmclass->prob1);
00866     ckd_free(lmclass);
00867 }
00868 
00869 int32
00870 ngram_model_add_class_word(ngram_model_t *model,
00871                            const char *classname,
00872                            const char *word,
00873                            float32 weight)
00874 {
00875     ngram_class_t *lmclass;
00876     int32 classid, tag_wid, wid, i, scale;
00877     float32 fprob;
00878 
00879     /* Find the class corresponding to classname.  Linear search
00880      * probably okay here since there won't be very many classes, and
00881      * this doesn't have to be fast. */
00882     tag_wid = ngram_wid(model, classname);
00883     if (tag_wid == NGRAM_INVALID_WID) {
00884         E_ERROR("No such word or class tag: %s\n", classname);
00885         return tag_wid;
00886     }
00887     for (classid = 0; classid < model->n_classes; ++classid) {
00888         if (model->classes[classid]->tag_wid == tag_wid)
00889             break;
00890     }
00891     /* Hmm, no such class.  It's probably not a good idea to create one. */
00892     if (classid == model->n_classes) {
00893         E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
00894         return NGRAM_INVALID_WID;
00895     }
00896     lmclass = model->classes[classid];
00897 
00898     /* Add this word to the model's set of words. */
00899     wid = ngram_add_word_internal(model, word, classid);
00900     if (wid == NGRAM_INVALID_WID)
00901         return wid;
00902 
00903     /* This is the fixed probability of the new word. */
00904     fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
00905     /* Now normalize everything else to fit it in.  This is
00906      * accomplished by simply scaling all the other probabilities
00907      * by (1-fprob). */
00908     scale = logmath_log(model->lmath, 1.0 - fprob);
00909     for (i = 0; i < lmclass->n_words; ++i)
00910         lmclass->prob1[i] += scale;
00911     for (i = 0; i < lmclass->n_hash; ++i)
00912         if (lmclass->nword_hash[i].wid != -1)
00913             lmclass->nword_hash[i].prob1 += scale;
00914 
00915     /* Now add it to the class hash table. */
00916     return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
00917 }
00918 
00919 int32
00920 ngram_model_add_class(ngram_model_t *model,
00921                       const char *classname,
00922                       float32 classweight,
00923                       char **words,
00924                       const float32 *weights,
00925                       int32 n_words)
00926 {
00927     ngram_class_t *lmclass;
00928     glist_t classwords = NULL;
00929     int32 i, start_wid = -1;
00930     int32 classid, tag_wid;
00931 
00932     /* Check if classname already exists in model.  If not, add it.*/
00933     if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
00934         tag_wid = ngram_model_add_word(model, classname, classweight);
00935         if (tag_wid == NGRAM_INVALID_WID)
00936             return -1;
00937     }
00938 
00939     if (model->n_classes == 128) {
00940         E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
00941         return -1;
00942     }
00943     classid = model->n_classes;
00944     for (i = 0; i < n_words; ++i) {
00945         int32 wid;
00946 
00947         wid = ngram_add_word_internal(model, words[i], classid);
00948         if (wid == NGRAM_INVALID_WID)
00949             return -1;
00950         if (start_wid == -1)
00951             start_wid = NGRAM_BASEWID(wid);
00952         classwords = glist_add_float32(classwords, weights[i]);
00953     }
00954     classwords = glist_reverse(classwords);
00955     lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
00956     glist_free(classwords);
00957     if (lmclass == NULL)
00958         return -1;
00959 
00960     ++model->n_classes;
00961     if (model->classes == NULL)
00962         model->classes = ckd_calloc(1, sizeof(*model->classes));
00963     else
00964         model->classes = ckd_realloc(model->classes,
00965                                      model->n_classes * sizeof(*model->classes));
00966     model->classes[classid] = lmclass;
00967     return classid;
00968 }
00969 
00970 int32
00971 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
00972 {
00973     int32 base_wid = NGRAM_BASEWID(wid);
00974 
00975     if (base_wid < lmclass->start_wid
00976         || base_wid > lmclass->start_wid + lmclass->n_words) {
00977         int32 hash;
00978 
00979         /* Look it up in the hash table. */
00980         hash = wid & (lmclass->n_hash - 1);
00981         while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
00982             hash = lmclass->nword_hash[hash].next;
00983         if (hash == -1)
00984             return 1;
00985         return lmclass->nword_hash[hash].prob1;
00986     }
00987     else {
00988         return lmclass->prob1[base_wid - lmclass->start_wid];
00989     }
00990 }
00991 
00992 int32
00993 read_classdef_file(hash_table_t *classes, const char *file_name)
00994 {
00995     FILE *fp;
00996     int32 is_pipe;
00997     int inclass;  
00998     int32 rv = -1;
00999     gnode_t *gn;
01000     glist_t classwords = NULL;
01001     glist_t classprobs = NULL;
01002     char *classname = NULL;
01003 
01004     if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
01005         E_ERROR("File %s not found\n", file_name);
01006         return -1;
01007     }
01008 
01009     inclass = FALSE;
01010     while (!feof(fp)) {
01011         char line[512];
01012         char *wptr[2];
01013         int n_words;
01014 
01015         if (fgets(line, sizeof(line), fp) == NULL)
01016             break;
01017 
01018         n_words = str2words(line, wptr, 2);
01019         if (n_words <= 0)
01020             continue;
01021 
01022         if (inclass) {
01023             /* Look for an end of class marker. */
01024             if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
01025                 classdef_t *classdef;
01026                 gnode_t *word, *weight;
01027                 int32 i;
01028 
01029                 if (classname == NULL || 0 != strcmp(wptr[1], classname))
01030                     goto error_out;
01031                 inclass = FALSE;
01032 
01033                 /* Construct a class from the list of words collected. */
01034                 classdef = ckd_calloc(1, sizeof(*classdef));
01035                 classwords = glist_reverse(classwords);
01036                 classprobs = glist_reverse(classprobs);
01037                 classdef->n_words = glist_count(classwords);
01038                 classdef->words = ckd_calloc(classdef->n_words,
01039                                              sizeof(*classdef->words));
01040                 classdef->weights = ckd_calloc(classdef->n_words,
01041                                                sizeof(*classdef->weights));
01042                 word = classwords;
01043                 weight = classprobs;
01044                 for (i = 0; i < classdef->n_words; ++i) {
01045                     classdef->words[i] = gnode_ptr(word);
01046                     classdef->weights[i] = gnode_float32(weight);
01047                     word = gnode_next(word);
01048                     weight = gnode_next(weight);
01049                 }
01050                 
01051                 /* Add this class to the hash table. */
01052                 if (hash_table_enter(classes, classname, classdef) != classdef) {
01053                     classdef_free(classdef);
01054                     goto error_out;
01055                 }
01056 
01057                 /* Reset everything. */
01058                 glist_free(classwords);
01059                 glist_free(classprobs);
01060                 classwords = NULL;
01061                 classprobs = NULL;
01062                 classname = NULL;
01063             }
01064             else {
01065                 float32 fprob;
01066 
01067                 if (n_words == 2)
01068                     fprob = (float32)atof_c(wptr[1]);
01069                 else
01070                     fprob = 1.0f;
01071                 /* Add it to the list of words for this class. */
01072                 classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
01073                 classprobs = glist_add_float32(classprobs, fprob);
01074             }
01075         }
01076         else {
01077             /* Start a new LM class if the LMCLASS marker is seen */
01078             if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
01079                 if (inclass)
01080                     goto error_out;
01081                 inclass = TRUE;
01082                 classname = ckd_salloc(wptr[1]);
01083             }
01084             /* Otherwise, just ignore whatever junk we got */
01085         }
01086     }
01087     rv = 0; /* Success. */
01088 
01089 error_out:
01090     /* Free all the stuff we might have allocated. */
01091     fclose_comp(fp, is_pipe);
01092     for (gn = classwords; gn; gn = gnode_next(gn))
01093         ckd_free(gnode_ptr(gn));
01094     glist_free(classwords);
01095     glist_free(classprobs);
01096     ckd_free(classname);
01097 
01098     return rv;
01099 }
01100 
01101 void
01102 classdef_free(classdef_t *classdef)
01103 {
01104     int32 i;
01105     for (i = 0; i < classdef->n_words; ++i)
01106         ckd_free(classdef->words[i]);
01107     ckd_free(classdef->words);
01108     ckd_free(classdef->weights);
01109     ckd_free(classdef);
01110 }
01111 
01112 
01113 int32
01114 ngram_model_read_classdef(ngram_model_t *model,
01115                           const char *file_name)
01116 {
01117     hash_table_t *classes;
01118     glist_t hl = NULL;
01119     gnode_t *gn;
01120     int32 rv = -1;
01121 
01122     classes = hash_table_new(0, FALSE);
01123     if (read_classdef_file(classes, file_name) < 0) {
01124         hash_table_free(classes);
01125         return -1;
01126     }
01127     
01128     /* Create a new class in the language model for each classdef. */
01129     hl = hash_table_tolist(classes, NULL);
01130     for (gn = hl; gn; gn = gnode_next(gn)) {
01131         hash_entry_t *he = gnode_ptr(gn);
01132         classdef_t *classdef = he->val;
01133 
01134         if (ngram_model_add_class(model, he->key, 1.0,
01135                                   classdef->words,
01136                                   classdef->weights,
01137                                   classdef->n_words) < 0)
01138             goto error_out;
01139     }
01140     rv = 0;
01141 
01142 error_out:
01143     for (gn = hl; gn; gn = gnode_next(gn)) {
01144         hash_entry_t *he = gnode_ptr(gn);
01145         ckd_free((char *)he->key);
01146         classdef_free(he->val);
01147     }
01148     glist_free(hl);
01149     hash_table_free(classes);
01150     return rv;
01151 }

Generated on Mon Jan 24 21:36:19 2011 for SphinxBase by  doxygen 1.4.7