src/libsphinxbase/util/huff_code.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2009 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 
00038 #include "huff_code.h"
00039 #include "ckd_alloc.h"
00040 #include "hash_table.h"
00041 #include "byteorder.h"
00042 #include "heap.h"
00043 #include "pio.h"
00044 #include "err.h"
00045 
00046 #include <string.h>
00047 
00048 typedef struct huff_node_s {
00049     int nbits;
00050     struct huff_node_s *l;
00051     union {
00052         int32 ival;
00053         char *sval;
00054         struct huff_node_s *r;
00055     } r;
00056 } huff_node_t;
00057 
00058 typedef struct huff_codeword_s {
00059     union {
00060         int32 ival;
00061         char *sval;
00062     } r;
00063     uint32 nbits, codeword;
00064 } huff_codeword_t;
00065 
00066 enum {
00067     HUFF_CODE_INT,
00068     HUFF_CODE_STR
00069 };
00070 
00071 struct huff_code_s {
00072     int16 refcount;
00073     uint8 maxbits;
00074     uint8 type;
00075     uint32 *firstcode;
00076     uint32 *numl;
00077     huff_codeword_t **syms;
00078     hash_table_t *codewords;
00079     FILE *fh;
00080     bit_encode_t *be;
00081     int boff;
00082 };
00083 
00084 static huff_node_t *
00085 huff_node_new_int(int32 val)
00086 {
00087     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00088     hn->r.ival = val;
00089     return hn;
00090 }
00091 
00092 static huff_node_t *
00093 huff_node_new_str(char const *val)
00094 {
00095     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00096     hn->r.sval = ckd_salloc(val);
00097     return hn;
00098 }
00099 
00100 static huff_node_t *
00101 huff_node_new_parent(huff_node_t *l, huff_node_t *r)
00102 {
00103     huff_node_t *hn = ckd_calloc(1, sizeof(*hn));
00104     hn->l = l;
00105     hn->r.r = r;
00106     /* Propagate maximum bit length. */
00107     if (r->nbits > l->nbits)
00108         hn->nbits = r->nbits + 1;
00109     else
00110         hn->nbits = l->nbits + 1;
00111     return hn;
00112 }
00113 
00114 static void
00115 huff_node_free_int(huff_node_t *root)
00116 {
00117     if (root->l) {
00118         huff_node_free_int(root->l);
00119         huff_node_free_int(root->r.r);
00120     }
00121     ckd_free(root);
00122 }
00123 
00124 static void
00125 huff_node_free_str(huff_node_t *root, int freestr)
00126 {
00127     if (root->l) {
00128         huff_node_free_str(root->l, freestr);
00129         huff_node_free_str(root->r.r, freestr);
00130     }
00131     else {
00132         if (freestr)
00133             ckd_free(root->r.sval);
00134     }
00135     ckd_free(root);
00136 }
00137 
00138 static huff_node_t *
00139 huff_code_build_tree(heap_t q)
00140 {
00141     huff_node_t *root = NULL;
00142     int32 rf;
00143 
00144     while (heap_size(q) > 1) {
00145         huff_node_t *l, *r, *p;
00146         int32 lf, rf;
00147 
00148         heap_pop(q, (void *)&l, &lf);
00149         heap_pop(q, (void *)&r, &rf);
00150         p = huff_node_new_parent(l, r);
00151         heap_insert(q, p, lf + rf);
00152     }
00153     heap_pop(q, (void **)&root, &rf);
00154     return root;
00155 }
00156 
00157 static void
00158 huff_code_canonicalize(huff_code_t *hc, huff_node_t *root)
00159 {
00160     glist_t agenda;
00161     uint32 *nextcode;
00162     int i, ncw;
00163 
00164     hc->firstcode = ckd_calloc(hc->maxbits+1, sizeof(*hc->firstcode));
00165     hc->syms = ckd_calloc(hc->maxbits+1, sizeof(*hc->syms));
00166     hc->numl = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
00167     nextcode = ckd_calloc(hc->maxbits+1, sizeof(*nextcode));
00168 
00169     /* Traverse the tree, annotating it with the actual bit
00170      * lengths, and histogramming them in numl. */
00171     root->nbits = 0;
00172     ncw = 0;
00173     agenda = glist_add_ptr(NULL, root);
00174     while (agenda) {
00175         huff_node_t *node = gnode_ptr(agenda);
00176         agenda = gnode_free(agenda, NULL);
00177         if (node->l) {
00178             node->l->nbits = node->nbits + 1;
00179             agenda = glist_add_ptr(agenda, node->l);
00180             node->r.r->nbits = node->nbits + 1;
00181             agenda = glist_add_ptr(agenda, node->r.r);
00182         }
00183         else {
00184             hc->numl[node->nbits]++;
00185             ncw++;
00186         }
00187     }
00188     /* Create starting codes and symbol tables for each bit length. */
00189     hc->syms[hc->maxbits] = ckd_calloc(hc->numl[hc->maxbits], sizeof(**hc->syms));
00190     for (i = hc->maxbits - 1; i > 0; --i) {
00191         hc->firstcode[i] = (hc->firstcode[i+1] + hc->numl[i+1]) / 2;
00192         hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
00193     }
00194     memcpy(nextcode, hc->firstcode, (hc->maxbits + 1) * sizeof(*nextcode));
00195     /* Traverse the tree again to produce the codebook itself. */
00196     hc->codewords = hash_table_new(ncw, HASH_CASE_YES);
00197     agenda = glist_add_ptr(NULL, root);
00198     while (agenda) {
00199         huff_node_t *node = gnode_ptr(agenda);
00200         agenda = gnode_free(agenda, NULL);
00201         if (node->l) {
00202             agenda = glist_add_ptr(agenda, node->l);
00203             agenda = glist_add_ptr(agenda, node->r.r);
00204         }
00205         else {
00206             /* Initialize codebook entry, which also retains symbol pointer. */
00207             huff_codeword_t *cw;
00208             uint32 codeword = nextcode[node->nbits] & ((1 << node->nbits) - 1);
00209             cw = hc->syms[node->nbits] + (codeword - hc->firstcode[node->nbits]);
00210             cw->nbits = node->nbits;
00211             cw->r.sval = node->r.sval; /* Will copy ints too... */
00212             cw->codeword = codeword;
00213             if (hc->type == HUFF_CODE_INT) {
00214                 hash_table_enter_bkey(hc->codewords,
00215                                       (char const *)&cw->r.ival,
00216                                       sizeof(cw->r.ival),
00217                                       (void *)cw);
00218             }
00219             else {
00220                 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
00221             }
00222             ++nextcode[node->nbits];
00223         }
00224     }
00225     ckd_free(nextcode);
00226 }
00227 
00228 huff_code_t *
00229 huff_code_build_int(int32 const *values, int32 const *frequencies, int nvals)
00230 {
00231     huff_code_t *hc;
00232     huff_node_t *root;
00233     heap_t q;
00234     int i;
00235 
00236     hc = ckd_calloc(1, sizeof(*hc));
00237     hc->refcount = 1;
00238     hc->type = HUFF_CODE_INT;
00239 
00240     /* Initialize the heap with nodes for each symbol. */
00241     q = heap_new();
00242     for (i = 0; i < nvals; ++i) {
00243         heap_insert(q,
00244                     huff_node_new_int(values[i]),
00245                     frequencies[i]);
00246     }
00247 
00248     /* Now build the tree, which gives us codeword lengths. */
00249     root = huff_code_build_tree(q);
00250     heap_destroy(q);
00251     if (root == NULL || root->nbits > 32) {
00252         E_ERROR("Huffman trees currently limited to 32 bits\n");
00253         huff_node_free_int(root);
00254         huff_code_free(hc);
00255         return NULL;
00256     }
00257 
00258     /* Build a canonical codebook. */
00259     hc->maxbits = root->nbits;
00260     huff_code_canonicalize(hc, root);
00261 
00262     /* Tree no longer needed. */
00263     huff_node_free_int(root);
00264 
00265     return hc;
00266 }
00267 
00268 huff_code_t *
00269 huff_code_build_str(char * const *values, int32 const *frequencies, int nvals)
00270 {
00271     huff_code_t *hc;
00272     huff_node_t *root;
00273     heap_t q;
00274     int i;
00275 
00276     hc = ckd_calloc(1, sizeof(*hc));
00277     hc->refcount = 1;
00278     hc->type = HUFF_CODE_STR;
00279 
00280     /* Initialize the heap with nodes for each symbol. */
00281     q = heap_new();
00282     for (i = 0; i < nvals; ++i) {
00283         heap_insert(q,
00284                     huff_node_new_str(values[i]),
00285                     frequencies[i]);
00286     }
00287 
00288     /* Now build the tree, which gives us codeword lengths. */
00289     root = huff_code_build_tree(q);
00290     heap_destroy(q);
00291     if (root == NULL || root->nbits > 32) {
00292         E_ERROR("Huffman trees currently limited to 32 bits\n");
00293         huff_node_free_str(root, TRUE);
00294         huff_code_free(hc);
00295         return NULL;
00296     }
00297 
00298     /* Build a canonical codebook. */
00299     hc->maxbits = root->nbits;
00300     huff_code_canonicalize(hc, root);
00301 
00302     /* Tree no longer needed (note we retain pointers to its strings). */
00303     huff_node_free_str(root, FALSE);
00304 
00305     return hc;
00306 }
00307 
00308 huff_code_t *
00309 huff_code_read(FILE *infh)
00310 {
00311     huff_code_t *hc;
00312     int i, j;
00313 
00314     hc = ckd_calloc(1, sizeof(*hc));
00315     hc->refcount = 1;
00316 
00317     hc->maxbits = fgetc(infh);
00318     hc->type = fgetc(infh);
00319 
00320     /* Two bytes of padding. */
00321     fgetc(infh);
00322     fgetc(infh);
00323 
00324     /* Allocate stuff. */
00325     hc->firstcode = ckd_calloc(hc->maxbits + 1, sizeof(*hc->firstcode));
00326     hc->numl = ckd_calloc(hc->maxbits + 1, sizeof(*hc->numl));
00327     hc->syms = ckd_calloc(hc->maxbits + 1, sizeof(*hc->syms));
00328 
00329     /* Read the symbol tables. */
00330     hc->codewords = hash_table_new(hc->maxbits, HASH_CASE_YES);
00331     for (i = 1; i <= hc->maxbits; ++i) {
00332         if (fread(&hc->firstcode[i], 4, 1, infh) != 1)
00333             goto error_out;
00334         SWAP_BE_32(&hc->firstcode[i]);
00335         if (fread(&hc->numl[i], 4, 1, infh) != 1)
00336             goto error_out;
00337         SWAP_BE_32(&hc->numl[i]);
00338         hc->syms[i] = ckd_calloc(hc->numl[i], sizeof(**hc->syms));
00339         for (j = 0; j < hc->numl[i]; ++j) {
00340             huff_codeword_t *cw = &hc->syms[i][j];
00341             cw->nbits = i;
00342             cw->codeword = hc->firstcode[i] + j;
00343             if (hc->type == HUFF_CODE_INT) {
00344                 if (fread(&cw->r.ival, 4, 1, infh) != 1)
00345                     goto error_out;
00346                 SWAP_BE_32(&cw->r.ival);
00347                 hash_table_enter_bkey(hc->codewords,
00348                                       (char const *)&cw->r.ival,
00349                                       sizeof(cw->r.ival),
00350                                       (void *)cw);
00351             }
00352             else {
00353                 size_t len;
00354                 cw->r.sval = fread_line(infh, &len);
00355                 cw->r.sval[len-1] = '\0';
00356                 hash_table_enter(hc->codewords, cw->r.sval, (void *)cw);
00357             }
00358         }
00359     }
00360 
00361     return hc;
00362 error_out:
00363     huff_code_free(hc);
00364     return NULL;
00365 }
00366 
00367 int
00368 huff_code_write(huff_code_t *hc, FILE *outfh)
00369 {
00370     int i, j;
00371 
00372     /* Maximum codeword length */
00373     fputc(hc->maxbits, outfh);
00374     /* Symbol type */
00375     fputc(hc->type, outfh);
00376     /* Two extra bytes (for future use and alignment) */
00377     fputc(0, outfh);
00378     fputc(0, outfh);
00379     /* For each codeword length: */
00380     for (i = 1; i <= hc->maxbits; ++i) {
00381         uint32 val;
00382 
00383         /* Starting code, number of codes. */
00384         val = hc->firstcode[i];
00385         /* Canonically big-endian (like the data itself) */
00386         SWAP_BE_32(&val);
00387         fwrite(&val, 4, 1, outfh);
00388         val = hc->numl[i];
00389         SWAP_BE_32(&val);
00390         fwrite(&val, 4, 1, outfh);
00391 
00392         /* Symbols for each code (FIXME: Should compress these too) */
00393         for (j = 0; j < hc->numl[i]; ++j) {
00394             if (hc->type == HUFF_CODE_INT) {
00395                 int32 val = hc->syms[i][j].r.ival;
00396                 SWAP_BE_32(&val);
00397                 fwrite(&val, 4, 1, outfh);
00398             }
00399             else {
00400                 /* Write them all separated by newlines, so that
00401                  * fgets() will read them for us. */
00402                 fprintf(outfh, "%s\n", hc->syms[i][j].r.sval);
00403             }
00404         }
00405     }
00406     return 0;
00407 }
00408 
00409 int
00410 huff_code_dump_codebits(FILE *dumpfh, uint32 nbits, uint32 codeword)
00411 {
00412     uint32 i;
00413 
00414     for (i = 0; i < nbits; ++i)
00415         fputc((codeword & (1<<(nbits-i-1))) ? '1' : '0', dumpfh);
00416     return 0;
00417 }
00418 
00419 int
00420 huff_code_dump(huff_code_t *hc, FILE *dumpfh)
00421 {
00422     int i, j;
00423 
00424     /* Print out all codewords. */
00425     fprintf(dumpfh, "Maximum codeword length: %d\n", hc->maxbits);
00426     fprintf(dumpfh, "Symbols are %s\n", (hc->type == HUFF_CODE_STR) ? "strings" : "ints");
00427     fprintf(dumpfh, "Codewords:\n");
00428     for (i = 1; i <= hc->maxbits; ++i) {
00429         for (j = 0; j < hc->numl[i]; ++j) {
00430             if (hc->type == HUFF_CODE_STR)
00431                 fprintf(dumpfh, "%-30s", hc->syms[i][j].r.sval);
00432             else
00433                 fprintf(dumpfh, "%-30d", hc->syms[i][j].r.ival);
00434             huff_code_dump_codebits(dumpfh, hc->syms[i][j].nbits,
00435                                     hc->syms[i][j].codeword);
00436             fprintf(dumpfh, "\n");
00437         }
00438     }
00439     return 0;
00440 }
00441 
00442 huff_code_t *
00443 huff_code_retain(huff_code_t *hc)
00444 {
00445     ++hc->refcount;
00446     return hc;
00447 }
00448 
00449 int
00450 huff_code_free(huff_code_t *hc)
00451 {
00452     int i;
00453 
00454     if (hc == NULL)
00455         return 0;
00456     if (--hc->refcount > 0)
00457         return hc->refcount;
00458     for (i = 0; i <= hc->maxbits; ++i) {
00459         int j;
00460         for (j = 0; j < hc->numl[i]; ++j) {
00461             if (hc->type == HUFF_CODE_STR)
00462                 ckd_free(hc->syms[i][j].r.sval);
00463         }
00464         ckd_free(hc->syms[i]);
00465     }
00466     ckd_free(hc->firstcode);
00467     ckd_free(hc->numl);
00468     ckd_free(hc->syms);
00469     hash_table_free(hc->codewords);
00470     ckd_free(hc);
00471     return 0;
00472 }
00473 
00474 FILE *
00475 huff_code_attach(huff_code_t *hc, FILE *fh, char const *mode)
00476 {
00477     FILE *oldfh = huff_code_detach(hc);
00478 
00479     hc->fh = fh;
00480     if (mode[0] == 'w')
00481         hc->be = bit_encode_attach(hc->fh);
00482     return oldfh;
00483 }
00484 
00485 FILE *
00486 huff_code_detach(huff_code_t *hc)
00487 {
00488     FILE *oldfh = hc->fh;
00489         
00490     if (hc->be) {
00491         bit_encode_flush(hc->be);
00492         bit_encode_free(hc->be);
00493         hc->be = NULL;
00494     }
00495     hc->fh = NULL;
00496     return oldfh;
00497 }
00498 
00499 int
00500 huff_code_encode_int(huff_code_t *hc, int32 sym, uint32 *outcw)
00501 {
00502     huff_codeword_t *cw;
00503 
00504     if (hash_table_lookup_bkey(hc->codewords,
00505                                (char const *)&sym,
00506                                sizeof(sym),
00507                                (void **)&cw) < 0)
00508         return 0;
00509     if (hc->be)
00510         bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
00511     if (outcw) *outcw = cw->codeword;
00512     return cw->nbits;
00513 }
00514 
00515 int
00516 huff_code_encode_str(huff_code_t *hc, char const *sym, uint32 *outcw)
00517 {
00518     huff_codeword_t *cw;
00519 
00520     if (hash_table_lookup(hc->codewords,
00521                           sym,
00522                           (void **)&cw) < 0)
00523         return 0;
00524     if (hc->be)
00525         bit_encode_write_cw(hc->be, cw->codeword, cw->nbits);
00526     if (outcw) *outcw = cw->codeword;
00527     return cw->nbits;
00528 }
00529 
00530 static huff_codeword_t *
00531 huff_code_decode_data(huff_code_t *hc, char const **inout_data, int *inout_offset)
00532 {
00533     uint32 cw;
00534     int cwlen;
00535     int byte;
00536 
00537     byte = *(*inout_data)++;
00538     cw = !!(byte & (1 << (7-(*inout_offset)++)));
00539     cwlen = 1;
00540     /* printf("%.*x ", cwlen, cw); */
00541     while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
00542         ++cwlen;
00543         cw <<= 1;
00544         if (*inout_offset > 7) {
00545             byte = *(*inout_data)++;
00546             *inout_offset = 0;
00547         }
00548         cw |= !!(byte & (1 << (7-(*inout_offset)++)));
00549         /* printf("%.*x ", cwlen, cw); */
00550     }
00551     if (cwlen > hc->maxbits) /* FAIL: invalid data */
00552         return NULL;
00553 
00554     /* Put the last byte back if there are bits left over. */
00555     if (*inout_offset < 8)
00556         --*inout_data;
00557     else
00558         *inout_offset = 0;
00559 
00560     /* printf("%.*x\n", cwlen, cw); */
00561     return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
00562 }
00563 
00564 static huff_codeword_t *
00565 huff_code_decode_fh(huff_code_t *hc)
00566 {
00567     uint32 cw;
00568     int cwlen;
00569     int byte;
00570 
00571     if ((byte = fgetc(hc->fh)) == EOF)
00572         return NULL;
00573     cw = !!(byte & (1 << (7-hc->boff++)));
00574     cwlen = 1;
00575     /* printf("%.*x ", cwlen, cw); */
00576     while (cwlen <= hc->maxbits && cw < hc->firstcode[cwlen]) {
00577         ++cwlen;
00578         cw <<= 1;
00579         if (hc->boff > 7) {
00580             if ((byte = fgetc(hc->fh)) == EOF)
00581                 return NULL;
00582             hc->boff = 0;
00583         }
00584         cw |= !!(byte & (1 << (7-hc->boff++)));
00585         /* printf("%.*x ", cwlen, cw); */
00586     }
00587     if (cwlen > hc->maxbits) /* FAIL: invalid data */
00588         return NULL;
00589 
00590     /* Put the last byte back if there are bits left over. */
00591     if (hc->boff < 8)
00592         ungetc(byte, hc->fh);
00593     else
00594         hc->boff = 0;
00595 
00596     /* printf("%.*x\n", cwlen, cw); */
00597     return hc->syms[cwlen] + (cw - hc->firstcode[cwlen]);
00598 }
00599 
00600 int
00601 huff_code_decode_int(huff_code_t *hc, int *outval,
00602                      char const **inout_data, int *inout_offset)
00603 {
00604     huff_codeword_t *cw;
00605 
00606     if (inout_data)
00607         cw = huff_code_decode_data(hc, inout_data, inout_offset);
00608     else if (hc->fh)
00609         cw = huff_code_decode_fh(hc);
00610     else
00611         return -1;
00612 
00613     if (cw == NULL)
00614         return -1;
00615     if (outval)
00616         *outval = cw->r.ival;
00617 
00618     return 0;
00619 }
00620 
00621 char const *
00622 huff_code_decode_str(huff_code_t *hc,
00623                      char const **inout_data, int *inout_offset)
00624 {
00625     huff_codeword_t *cw;
00626 
00627     if (inout_data)
00628         cw = huff_code_decode_data(hc, inout_data, inout_offset);
00629     else if (hc->fh)
00630         cw = huff_code_decode_fh(hc);
00631     else
00632         return NULL;
00633 
00634     if (cw == NULL)
00635         return NULL;
00636 
00637     return cw->r.sval;
00638 }

Generated on Mon Jan 24 21:36:19 2011 for SphinxBase by  doxygen 1.4.7