src/libsphinxbase/lm/ngram_model_dmp.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * \file ngram_model_dmp.c DMP format language models
00039  *
00040  * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
00041  */
00042 
00043 #include "ckd_alloc.h"
00044 #include "ngram_model_dmp.h"
00045 #include "pio.h"
00046 #include "err.h"
00047 #include "byteorder.h"
00048 #include "listelem_alloc.h"
00049 
00050 #include <stdio.h>
00051 #include <string.h>
00052 #include <stdlib.h>
00053 #include <limits.h>
00054 
00055 static const char darpa_hdr[] = "Darpa Trigram LM";
00056 static ngram_funcs_t ngram_model_dmp_funcs;
00057 
00058 #define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
00059 #define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
00060 #define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
00061 
00062 static unigram_t *
00063 new_unigram_table(int32 n_ug)
00064 {
00065     unigram_t *table;
00066     int32 i;
00067 
00068     table = ckd_calloc(n_ug, sizeof(unigram_t));
00069     for (i = 0; i < n_ug; i++) {
00070         table[i].prob1.f = -99.0;
00071         table[i].bo_wt1.f = -99.0;
00072     }
00073     return table;
00074 }
00075 
00076 ngram_model_t *
00077 ngram_model_dmp_read(cmd_ln_t *config,
00078                      const char *file_name,
00079                      logmath_t *lmath)
00080 {
00081     ngram_model_t *base;
00082     ngram_model_dmp_t *model;
00083     FILE *fp;
00084     int do_mmap, do_swap;
00085     int32 is_pipe;
00086     int32 i, j, k, vn, n, ts;
00087     int32 n_unigram;
00088     int32 n_bigram;
00089     int32 n_trigram;
00090     char str[1024];
00091     unigram_t *ugptr;
00092     bigram_t *bgptr;
00093     trigram_t *tgptr;
00094     char *tmp_word_str;
00095     char *map_base = NULL;
00096     size_t offset = 0, filesize;
00097 
00098     do_mmap = FALSE;
00099     if (config)
00100         do_mmap = cmd_ln_boolean_r(config, "-mmap");
00101 
00102     if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
00103         E_ERROR("Dump file %s not found\n", file_name);
00104         return NULL;
00105     }
00106 
00107     if (is_pipe && do_mmap) {
00108         E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
00109         do_mmap = 0;
00110     }
00111 
00112     do_swap = FALSE;
00113     fread(&k, sizeof(k), 1, fp);
00114     if (k != strlen(darpa_hdr)+1) {
00115         SWAP_INT32(&k);
00116         if (k != strlen(darpa_hdr)+1) {
00117             E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
00118             fclose(fp);
00119             return NULL;
00120         }
00121         do_swap = 1;
00122     }
00123     if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00124         E_ERROR("Cannot read header\n");
00125         fclose_comp(fp, is_pipe);
00126         return NULL;
00127     }
00128     if (strncmp(str, darpa_hdr, k) != 0) {
00129         E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
00130         fclose(fp);
00131         return NULL;
00132     }
00133 
00134     if (do_mmap) {
00135         if (do_swap) {
00136             E_INFO
00137                 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
00138             do_mmap = 0;
00139         }
00140         else {
00141             E_INFO("Will use memory-mapped I/O for LM file\n");
00142 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
00143             E_FATAL("memory mapping is not supported at the moment.");
00144 #else
00145 #endif
00146         }
00147     }
00148 
00149     fread(&k, sizeof(k), 1, fp);
00150     if (do_swap) SWAP_INT32(&k);
00151     if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00152         E_ERROR("Cannot read LM filename in header\n");
00153         fclose(fp);
00154         return NULL;
00155     }
00156 
00157     /* read version#, if present (must be <= 0) */
00158     fread(&vn, sizeof(vn), 1, fp);
00159     if (do_swap) SWAP_INT32(&vn);
00160     if (vn <= 0) {
00161         /* read and don't compare timestamps (we don't care) */
00162         fread(&ts, sizeof(ts), 1, fp);
00163         if (do_swap) SWAP_INT32(&ts);
00164 
00165         /* read and skip format description */
00166         for (;;) {
00167             fread(&k, sizeof(k), 1, fp);
00168             if (do_swap) SWAP_INT32(&k);
00169             if (k == 0)
00170                 break;
00171             if (fread(str, sizeof(char), k, fp) != (size_t) k) {
00172                 E_ERROR("fread(word) failed\n");
00173                 fclose(fp);
00174                 return NULL;
00175             }
00176         }
00177         /* read model->ucount */
00178         fread(&n_unigram, sizeof(n_unigram), 1, fp);
00179         if (do_swap) SWAP_INT32(&n_unigram);
00180     }
00181     else {
00182         n_unigram = vn;
00183     }
00184 
00185     /* read model->bcount, tcount */
00186     fread(&n_bigram, sizeof(n_bigram), 1, fp);
00187     if (do_swap) SWAP_INT32(&n_bigram);
00188     fread(&n_trigram, sizeof(n_trigram), 1, fp);
00189     if (do_swap) SWAP_INT32(&n_trigram);
00190     E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
00191 
00192     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
00193     model = ckd_calloc(1, sizeof(*model));
00194     base = &model->base;
00195     if (n_trigram > 0)
00196         n = 3;
00197     else if (n_bigram > 0)
00198         n = 2;
00199     else
00200         n = 1;
00201     ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
00202     base->n_counts[0] = n_unigram;
00203     base->n_counts[1] = n_bigram;
00204     base->n_counts[2] = n_trigram;
00205 
00206     /* read unigrams (always in memory, as they contain dictionary
00207      * mappings that can't be precomputed, and also could have OOVs added) */
00208     model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
00209     ugptr = model->lm3g.unigrams;
00210     for (i = 0; i <= n_unigram; ++i) {
00211         /* Skip over the mapping ID, we don't care about it. */
00212         if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
00213             E_ERROR("fread(mapid[%d]) failed\n", i);
00214             ngram_model_free(base);
00215             fclose_comp(fp, is_pipe);
00216             return NULL;
00217         }
00218         /* Read the actual unigram structure. */
00219         if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1)  {
00220             E_ERROR("fread(unigrams) failed\n");
00221             ngram_model_free(base);
00222             fclose_comp(fp, is_pipe);
00223             return NULL;
00224         }
00225         /* Byte swap if necessary. */
00226         if (do_swap) {
00227             SWAP_INT32(&ugptr->prob1.l);
00228             SWAP_INT32(&ugptr->bo_wt1.l);
00229             SWAP_INT32(&ugptr->bigrams);
00230         }
00231         /* Convert values to log. */
00232         ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
00233         ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
00234         ++ugptr;
00235     }
00236     E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
00237 
00238     /* Now mmap() the file and read in the rest of the (read-only) stuff. */
00239     if (do_mmap) {
00240         offset = ftell(fp);
00241         fseek(fp, 0, SEEK_END);
00242         filesize = ftell(fp);
00243         fseek(fp, offset, SEEK_SET);
00244 
00245         /* Check for improper word alignment. */
00246         if (offset & 0x3) {
00247             E_WARN("-mmap specified, but tseg_base is not word-aligned.  Will not memory-map.\n");
00248             do_mmap = FALSE;
00249         }
00250         else {
00251             model->dump_mmap = mmio_file_read(file_name);
00252             if (model->dump_mmap == NULL) {
00253                 do_mmap = FALSE;
00254             }
00255             else {
00256                 map_base = mmio_file_ptr(model->dump_mmap);
00257             }
00258         }
00259     }
00260 
00261     /* read bigrams */
00262     if (do_mmap) {
00263         model->lm3g.bigrams = (bigram_t *) (map_base + offset);
00264         offset += (n_bigram + 1) * sizeof(bigram_t);
00265     }
00266     else {
00267         model->lm3g.bigrams =
00268             ckd_calloc(n_bigram + 1, sizeof(bigram_t));
00269         if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
00270             != (size_t) n_bigram + 1) {
00271             E_ERROR("fread(bigrams) failed\n");
00272             ngram_model_free(base);
00273             fclose_comp(fp, is_pipe);
00274             return NULL;
00275         }
00276         if (do_swap) {
00277             for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
00278                  i++, bgptr++) {
00279                 SWAP_INT16(&bgptr->wid);
00280                 SWAP_INT16(&bgptr->prob2);
00281                 SWAP_INT16(&bgptr->bo_wt2);
00282                 SWAP_INT16(&bgptr->trigrams);
00283             }
00284         }
00285     }
00286     E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
00287 
00288     /* read trigrams */
00289     if (n_trigram > 0) {
00290         if (do_mmap) {
00291             model->lm3g.trigrams = (trigram_t *) (map_base + offset);
00292             offset += n_trigram * sizeof(trigram_t);
00293         }
00294         else {
00295             model->lm3g.trigrams =
00296                 ckd_calloc(n_trigram, sizeof(trigram_t));
00297             if (fread
00298                 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
00299                 != (size_t) n_trigram) {
00300                 E_ERROR("fread(trigrams) failed\n");
00301                 ngram_model_free(base);
00302                 fclose_comp(fp, is_pipe);
00303                 return NULL;
00304             }
00305             if (do_swap) {
00306                 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
00307                      i++, tgptr++) {
00308                     SWAP_INT16(&tgptr->wid);
00309                     SWAP_INT16(&tgptr->prob3);
00310                 }
00311             }
00312         }
00313         E_INFO("%8d = LM.trigrams read\n", n_trigram);
00314         /* Initialize tginfo */
00315         model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
00316         model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
00317     }
00318 
00319     /* read n_prob2 and prob2 array (in memory) */
00320     if (do_mmap)
00321         fseek(fp, offset, SEEK_SET);
00322     fread(&k, sizeof(k), 1, fp);
00323     if (do_swap) SWAP_INT32(&k);
00324     model->lm3g.n_prob2 = k;
00325     model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
00326     if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
00327         E_ERROR("fread(prob2) failed\n");
00328         ngram_model_free(base);
00329         fclose_comp(fp, is_pipe);
00330         return NULL;
00331     }
00332     for (i = 0; i < k; i++) {
00333         if (do_swap)
00334             SWAP_INT32(&model->lm3g.prob2[i].l);
00335         /* Convert values to log. */
00336         model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
00337     }
00338     E_INFO("%8d = LM.prob2 entries read\n", k);
00339 
00340     /* read n_bo_wt2 and bo_wt2 array (in memory) */
00341     if (base->n > 2) {
00342         fread(&k, sizeof(k), 1, fp);
00343         if (do_swap) SWAP_INT32(&k);
00344         model->lm3g.n_bo_wt2 = k;
00345         model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
00346         if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
00347             E_ERROR("fread(bo_wt2) failed\n");
00348             ngram_model_free(base);
00349             fclose_comp(fp, is_pipe);
00350             return NULL;
00351         }
00352         for (i = 0; i < k; i++) {
00353             if (do_swap)
00354                 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
00355             /* Convert values to log. */
00356             model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
00357         }
00358         E_INFO("%8d = LM.bo_wt2 entries read\n", k);
00359     }
00360 
00361     /* read n_prob3 and prob3 array (in memory) */
00362     if (base->n > 2) {
00363         fread(&k, sizeof(k), 1, fp);
00364         if (do_swap) SWAP_INT32(&k);
00365         model->lm3g.n_prob3 = k;
00366         model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
00367         if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
00368             E_ERROR("fread(prob3) failed\n");
00369             ngram_model_free(base);
00370             fclose_comp(fp, is_pipe);
00371             return NULL;
00372         }
00373         for (i = 0; i < k; i++) {
00374             if (do_swap)
00375                 SWAP_INT32(&model->lm3g.prob3[i].l);
00376             /* Convert values to log. */
00377             model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
00378         }
00379         E_INFO("%8d = LM.prob3 entries read\n", k);
00380     }
00381 
00382     /* read tseg_base size and tseg_base */
00383     if (do_mmap)
00384         offset = ftell(fp);
00385     if (n_trigram > 0) {
00386         if (do_mmap) {
00387             memcpy(&k, map_base + offset, sizeof(k));
00388             offset += sizeof(int32);
00389             model->lm3g.tseg_base = (int32 *) (map_base + offset);
00390             offset += k * sizeof(int32);
00391         }
00392         else {
00393             k = (n_bigram + 1) / BG_SEG_SZ + 1;
00394             fread(&k, sizeof(k), 1, fp);
00395             if (do_swap) SWAP_INT32(&k);
00396             model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
00397             if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
00398                 (size_t) k) {
00399                 E_ERROR("fread(tseg_base) failed\n");
00400                 ngram_model_free(base);
00401                 fclose_comp(fp, is_pipe);
00402                 return NULL;
00403             }
00404             if (do_swap)
00405                 for (i = 0; i < k; i++)
00406                     SWAP_INT32(&model->lm3g.tseg_base[i]);
00407         }
00408         E_INFO("%8d = LM.tseg_base entries read\n", k);
00409     }
00410 
00411     /* read ascii word strings */
00412     if (do_mmap) {
00413         memcpy(&k, map_base + offset, sizeof(k));
00414         offset += sizeof(int32);
00415         tmp_word_str = (char *) (map_base + offset);
00416         offset += k;
00417     }
00418     else {
00419         base->writable = TRUE;
00420         fread(&k, sizeof(k), 1, fp);
00421         if (do_swap) SWAP_INT32(&k);
00422         tmp_word_str = ckd_calloc(k, sizeof(char));
00423         if (fread(tmp_word_str, sizeof(char), k, fp) != (size_t) k) {
00424             E_ERROR("fread(word-string) failed\n");
00425             ngram_model_free(base);
00426             fclose_comp(fp, is_pipe);
00427             return NULL;
00428         }
00429     }
00430 
00431     /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
00432     for (i = 0, j = 0; i < k; i++)
00433         if (tmp_word_str[i] == '\0')
00434             j++;
00435     if (j != n_unigram) {
00436         E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
00437                 j, n_unigram);
00438         ngram_model_free(base);
00439         fclose_comp(fp, is_pipe);
00440         return NULL;
00441     }
00442 
00443     /* Break up string just read into words */
00444     if (do_mmap) {
00445         j = 0;
00446         for (i = 0; i < n_unigram; i++) {
00447             base->word_str[i] = tmp_word_str + j;
00448             if (hash_table_enter(base->wid, base->word_str[i],
00449                                  (void *)(long)i) != (void *)(long)i) {
00450                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00451             }
00452             j += strlen(base->word_str[i]) + 1;
00453         }
00454     }
00455     else {
00456         j = 0;
00457         for (i = 0; i < n_unigram; i++) {
00458             base->word_str[i] = ckd_salloc(tmp_word_str + j);
00459             if (hash_table_enter(base->wid, base->word_str[i],
00460                                  (void *)(long)i) != (void *)(long)i) {
00461                 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
00462             }
00463             j += strlen(base->word_str[i]) + 1;
00464         }
00465         free(tmp_word_str);
00466     }
00467     E_INFO("%8d = ascii word strings read\n", i);
00468 
00469     fclose_comp(fp, is_pipe);
00470     return base;
00471 }
00472 
00473 int
00474 ngram_model_dmp_write(ngram_model_t *model,
00475                       const char *file_name)
00476 {
00477     return -1;
00478 }
00479 
00480 static int
00481 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
00482                               float32 wip, float32 uw)
00483 {
00484     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00485     lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
00486     return 0;
00487 }
00488 
00489 /* Lousy "templating" for things that are largely the same in DMP and
00490  * ARPA models, except for the bigram and trigram types and some
00491  * names. */
00492 #define NGRAM_MODEL_TYPE ngram_model_dmp_t
00493 #include "lm3g_templates.c"
00494 
00495 static void
00496 ngram_model_dmp_free(ngram_model_t *base)
00497 {
00498     ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
00499 
00500     ckd_free(model->lm3g.unigrams);
00501     ckd_free(model->lm3g.prob2);
00502     if (model->dump_mmap) {
00503         mmio_file_unmap(model->dump_mmap);
00504     } 
00505     else {
00506         ckd_free(model->lm3g.bigrams);
00507         if (base->n > 2) {
00508             ckd_free(model->lm3g.trigrams);
00509             ckd_free(model->lm3g.tseg_base);
00510         }
00511     }
00512     if (base->n > 2) {
00513         ckd_free(model->lm3g.bo_wt2);
00514         ckd_free(model->lm3g.prob3);
00515     }
00516 
00517     lm3g_tginfo_free(base, &model->lm3g);
00518 }
00519 
00520 static ngram_funcs_t ngram_model_dmp_funcs = {
00521     ngram_model_dmp_free,          /* free */
00522     ngram_model_dmp_apply_weights, /* apply_weights */
00523     lm3g_template_score,           /* score */
00524     lm3g_template_raw_score,       /* raw_score */
00525     lm3g_template_add_ug,          /* add_ug */
00526     lm3g_template_flush,           /* flush */
00527     lm3g_template_iter,             /* iter */
00528     lm3g_template_mgrams,          /* mgrams */
00529     lm3g_template_successors,      /* successors */
00530     lm3g_template_iter_get,        /* iter_get */
00531     lm3g_template_iter_next,       /* iter_next */
00532     lm3g_template_iter_free        /* iter_free */
00533 };

Generated on Mon Jan 24 21:36:19 2011 for SphinxBase by  doxygen 1.4.7