00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043 #include "config.h"
00044 #include "ngram_model.h"
00045 #include "ngram_model_internal.h"
00046 #include "ckd_alloc.h"
00047 #include "filename.h"
00048 #include "pio.h"
00049 #include "err.h"
00050 #include "logmath.h"
00051 #include "strfuncs.h"
00052 #include "case.h"
00053
00054 #include <string.h>
00055 #include <assert.h>
00056 #ifdef HAVE_ICONV
00057 #include <iconv.h>
00058 #endif
00059
00060 ngram_file_type_t
00061 ngram_file_name_to_type(const char *file_name)
00062 {
00063 const char *ext;
00064
00065 ext = strrchr(file_name, '.');
00066 if (ext == NULL) {
00067 return NGRAM_ARPA;
00068 }
00069 if (0 == strcmp_nocase(ext, ".gz")) {
00070 while (--ext >= file_name) {
00071 if (*ext == '.') break;
00072 }
00073 if (ext < file_name) {
00074 return NGRAM_ARPA;
00075 }
00076 }
00077 else if (0 == strcmp_nocase(ext, ".bz2")) {
00078 while (--ext >= file_name) {
00079 if (*ext == '.') break;
00080 }
00081 if (ext < file_name) {
00082 return NGRAM_ARPA;
00083 }
00084 }
00085
00086 if (0 == strncmp_nocase(ext, ".ARPA", 5))
00087 return NGRAM_ARPA;
00088 if (0 == strncmp_nocase(ext, ".DMP32", 6))
00089 return NGRAM_DMP32;
00090 if (0 == strncmp_nocase(ext, ".DMP", 4))
00091 return NGRAM_DMP;
00092 return NGRAM_ARPA;
00093 }
00094
00095 ngram_model_t *
00096 ngram_model_read(cmd_ln_t *config,
00097 const char *file_name,
00098 ngram_file_type_t file_type,
00099 logmath_t *lmath)
00100 {
00101 ngram_model_t *model = NULL;
00102
00103 switch (file_type) {
00104 case NGRAM_AUTO: {
00105 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
00106 break;
00107 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
00108 break;
00109 if ((model = ngram_model_dmp32_read(config, file_name, lmath)) != NULL)
00110 break;
00111 return NULL;
00112 }
00113 case NGRAM_ARPA:
00114 model = ngram_model_arpa_read(config, file_name, lmath);
00115 break;
00116 case NGRAM_DMP:
00117 model = ngram_model_dmp_read(config, file_name, lmath);
00118 break;
00119 case NGRAM_DMP32:
00120 model = ngram_model_dmp32_read(config, file_name, lmath);
00121 break;
00122 }
00123
00124
00125 if (config) {
00126 float32 lw = 1.0;
00127 float32 wip = 1.0;
00128 float32 uw = 1.0;
00129
00130 if (cmd_ln_exists_r(config, "-lw"))
00131 lw = cmd_ln_float32_r(config, "-lw");
00132 if (cmd_ln_exists_r(config, "-wip"))
00133 wip = cmd_ln_float32_r(config, "-wip");
00134 if (cmd_ln_exists_r(config, "-uw"))
00135 uw = cmd_ln_float32_r(config, "-uw");
00136
00137 ngram_model_apply_weights(model, lw, wip, uw);
00138 }
00139
00140 return model;
00141 }
00142
00143 int
00144 ngram_model_write(ngram_model_t *model, const char *file_name,
00145 ngram_file_type_t file_type)
00146 {
00147 switch (file_type) {
00148 case NGRAM_AUTO: {
00149 file_type = ngram_file_name_to_type(file_name);
00150 return ngram_model_write(model, file_name, file_type);
00151 }
00152 case NGRAM_ARPA:
00153 return ngram_model_arpa_write(model, file_name);
00154 case NGRAM_DMP:
00155 return ngram_model_dmp_write(model, file_name);
00156 case NGRAM_DMP32:
00157 return ngram_model_dmp32_write(model, file_name);
00158 }
00159
00160 return -1;
00161 }
00162
00163 int32
00164 ngram_model_init(ngram_model_t *base,
00165 ngram_funcs_t *funcs,
00166 logmath_t *lmath,
00167 int32 n, int32 n_unigram)
00168 {
00169 base->refcount = 1;
00170 base->funcs = funcs;
00171 base->n = n;
00172
00173 if (base->n_counts == NULL)
00174 base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
00175
00176 if (base->lmath != lmath) {
00177
00178 base->lw = 1.0;
00179 base->log_wip = 0;
00180 base->log_uw = 0;
00181 base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
00182 base->log_uniform_weight = logmath_get_zero(lmath);
00183 base->log_zero = logmath_get_zero(lmath);
00184 base->lmath = lmath;
00185 }
00186
00187 if (base->word_str) {
00188
00189 if (base->writable) {
00190 int32 i;
00191 for (i = 0; i < base->n_words; ++i) {
00192 ckd_free(base->word_str[i]);
00193 base->word_str[i] = NULL;
00194 }
00195 }
00196 base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
00197 }
00198 else
00199 base->word_str = ckd_calloc(n_unigram, sizeof(char *));
00200
00201
00202 if (base->wid)
00203 hash_table_empty(base->wid);
00204 else
00205 base->wid = hash_table_new(n_unigram, FALSE);
00206 base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
00207
00208 return 0;
00209 }
00210
00211 ngram_model_t *
00212 ngram_model_retain(ngram_model_t *model)
00213 {
00214 ++model->refcount;
00215 return model;
00216 }
00217
00218
00219 void
00220 ngram_model_flush(ngram_model_t *model)
00221 {
00222 if (model->funcs && model->funcs->flush)
00223 (*model->funcs->flush)(model);
00224 }
00225
00226 int
00227 ngram_model_free(ngram_model_t *model)
00228 {
00229 int i;
00230
00231 if (model == NULL)
00232 return 0;
00233 if (--model->refcount > 0)
00234 return model->refcount;
00235 if (model->funcs && model->funcs->free)
00236 (*model->funcs->free)(model);
00237 if (model->writable) {
00238
00239 for (i = 0; i < model->n_words; ++i) {
00240 ckd_free(model->word_str[i]);
00241 }
00242 }
00243 else {
00244
00245 for (i = 0; i < model->n_classes; ++i) {
00246 ngram_class_t *lmclass;
00247 int32 j;
00248
00249 lmclass = model->classes[i];
00250 for (j = 0; j < lmclass->n_words; ++j) {
00251 ckd_free(model->word_str[lmclass->start_wid + j]);
00252 }
00253 for (j = 0; j < lmclass->n_hash; ++j) {
00254 if (lmclass->nword_hash[j].wid != -1) {
00255 ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
00256 }
00257 }
00258 }
00259 }
00260 for (i = 0; i < model->n_classes; ++i) {
00261 ngram_class_free(model->classes[i]);
00262 }
00263 ckd_free(model->classes);
00264 hash_table_free(model->wid);
00265 ckd_free(model->word_str);
00266 ckd_free(model->n_counts);
00267 ckd_free(model);
00268 return 0;
00269 }
00270
00271
00272 #ifdef HAVE_ICONV
00273 int
00274 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00275 {
00276 iconv_t ic;
00277 char *outbuf;
00278 size_t maxlen;
00279 int i, writable;
00280 hash_table_t *new_wid;
00281
00282
00283
00284 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
00285 E_ERROR_SYSTEM("iconv_open() failed");
00286 return -1;
00287 }
00288
00289
00290
00291
00292
00293
00294
00295 maxlen = 0;
00296 for (i = 0; i < model->n_words; ++i) {
00297 if (strlen(model->word_str[i]) > maxlen)
00298 maxlen = strlen(model->word_str[i]);
00299 }
00300
00301 writable = model->writable;
00302
00303 model->writable = TRUE;
00304
00305 maxlen = maxlen * sizeof(int) + 15;
00306 outbuf = ckd_calloc(maxlen, 1);
00307
00308
00309 new_wid = hash_table_new(model->n_words, FALSE);
00310 for (i = 0; i < model->n_words; ++i) {
00311 ICONV_CONST char *in;
00312 char *out;
00313 size_t inleft, outleft, result;
00314
00315 start_conversion:
00316 in = (ICONV_CONST char *)model->word_str[i];
00317
00318 inleft = strlen(in);
00319 out = outbuf;
00320 outleft = maxlen;
00321
00322 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
00323 if (errno != E2BIG) {
00324
00325
00326 E_ERROR_SYSTEM("iconv() failed");
00327 ckd_free(outbuf);
00328 hash_table_free(new_wid);
00329 return -1;
00330 }
00331
00332 iconv(ic, NULL, NULL, NULL, NULL);
00333
00334 maxlen *= 2;
00335 out = outbuf = ckd_realloc(outbuf, maxlen);
00336
00337 in = (ICONV_CONST char *)model->word_str[i];
00338 inleft = strlen(in);
00339 }
00340
00341
00342 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
00343 if (errno != E2BIG) {
00344
00345
00346 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
00347 ckd_free(outbuf);
00348 hash_table_free(new_wid);
00349 return -1;
00350 }
00351
00352 iconv(ic, NULL, NULL, NULL, NULL);
00353
00354 maxlen *= 2;
00355 outbuf = ckd_realloc(outbuf, maxlen);
00356
00357 goto start_conversion;
00358 }
00359
00360 result = maxlen - outleft;
00361
00362 if (writable) {
00363
00364 model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
00365 model->word_str[i][result] = '\0';
00366 }
00367 else {
00368
00369 model->word_str[i] = ckd_calloc(result + 1, 1);
00370 }
00371
00372 memcpy(model->word_str[i], outbuf, result);
00373
00374
00375
00376
00377 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
00378 E_WARN("Duplicate word in dictionary after conversion: %s\n",
00379 model->word_str[i]);
00380 }
00381 }
00382 ckd_free(outbuf);
00383 iconv_close(ic);
00384
00385 hash_table_free(model->wid);
00386 model->wid = new_wid;
00387
00388 return 0;
00389 }
00390 #else
00391 int
00392 ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
00393 {
00394 return -1;
00395 }
00396 #endif
00397
00398 int
00399 ngram_model_apply_weights(ngram_model_t *model,
00400 float32 lw, float32 wip, float32 uw)
00401 {
00402 return (*model->funcs->apply_weights)(model, lw, wip, uw);
00403 }
00404
00405 float32
00406 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
00407 int32 *out_log_uw)
00408 {
00409 if (out_log_wip) *out_log_wip = model->log_wip;
00410 if (out_log_uw) *out_log_uw = model->log_uw;
00411 return model->lw;
00412 }
00413
00414
00415 int32
00416 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
00417 int32 n_hist, int32 *n_used)
00418 {
00419 int32 score, class_weight = 0;
00420 int i;
00421
00422
00423 if (wid == NGRAM_INVALID_WID)
00424 return model->log_zero;
00425
00426
00427 if (NGRAM_IS_CLASSWID(wid)) {
00428 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00429
00430 class_weight = ngram_class_prob(lmclass, wid);
00431 if (class_weight == 1)
00432 return model->log_zero;
00433 wid = lmclass->tag_wid;
00434 }
00435 for (i = 0; i < n_hist; ++i) {
00436 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00437 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00438 }
00439 score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
00440
00441
00442 return score + class_weight;
00443 }
00444
00445 int32
00446 ngram_score(ngram_model_t *model, const char *word, ...)
00447 {
00448 va_list history;
00449 const char *hword;
00450 int32 *histid;
00451 int32 n_hist;
00452 int32 n_used;
00453 int32 prob;
00454
00455 va_start(history, word);
00456 n_hist = 0;
00457 while ((hword = va_arg(history, const char *)) != NULL)
00458 ++n_hist;
00459 va_end(history);
00460
00461 histid = ckd_calloc(n_hist, sizeof(*histid));
00462 va_start(history, word);
00463 n_hist = 0;
00464 while ((hword = va_arg(history, const char *)) != NULL) {
00465 histid[n_hist] = ngram_wid(model, hword);
00466 ++n_hist;
00467 }
00468 va_end(history);
00469
00470 prob = ngram_ng_score(model, ngram_wid(model, word),
00471 histid, n_hist, &n_used);
00472 ckd_free(histid);
00473 return prob;
00474 }
00475
00476 int32
00477 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
00478 {
00479 int32 hist[2] = { w2, w1 };
00480 return ngram_ng_score(model, w3, hist, 2, n_used);
00481 }
00482
00483 int32
00484 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
00485 {
00486 return ngram_ng_score(model, w2, &w1, 1, n_used);
00487 }
00488
00489 int32
00490 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
00491 int32 n_hist, int32 *n_used)
00492 {
00493 int32 prob, class_weight = 0;
00494 int i;
00495
00496
00497 if (wid == NGRAM_INVALID_WID)
00498 return model->log_zero;
00499
00500
00501 if (NGRAM_IS_CLASSWID(wid)) {
00502 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
00503
00504 class_weight = ngram_class_prob(lmclass, wid);
00505 if (class_weight == 1)
00506 return class_weight;
00507 wid = lmclass->tag_wid;
00508 }
00509 for (i = 0; i < n_hist; ++i) {
00510 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
00511 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
00512 }
00513 prob = (*model->funcs->raw_score)(model, wid, history,
00514 n_hist, n_used);
00515
00516 return prob + class_weight;
00517 }
00518
00519 int32
00520 ngram_prob(ngram_model_t *model, const char *word, ...)
00521 {
00522 va_list history;
00523 const char *hword;
00524 int32 *histid;
00525 int32 n_hist;
00526 int32 n_used;
00527 int32 prob;
00528
00529 va_start(history, word);
00530 n_hist = 0;
00531 while ((hword = va_arg(history, const char *)) != NULL)
00532 ++n_hist;
00533 va_end(history);
00534
00535 histid = ckd_calloc(n_hist, sizeof(*histid));
00536 va_start(history, word);
00537 n_hist = 0;
00538 while ((hword = va_arg(history, const char *)) != NULL) {
00539 histid[n_hist] = ngram_wid(model, hword);
00540 ++n_hist;
00541 }
00542 va_end(history);
00543
00544 prob = ngram_ng_prob(model, ngram_wid(model, word),
00545 histid, n_hist, &n_used);
00546 ckd_free(histid);
00547 return prob;
00548 }
00549
00550 int32
00551 ngram_score_to_prob(ngram_model_t *base, int32 score)
00552 {
00553 int32 prob;
00554
00555
00556 prob = score - base->log_wip;
00557
00558 prob = (int32)(prob / base->lw);
00559
00560 return prob;
00561 }
00562
00563 int32
00564 ngram_unknown_wid(ngram_model_t *model)
00565 {
00566 int32 val;
00567
00568
00569
00570 if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
00571 return NGRAM_INVALID_WID;
00572 else
00573 return val;
00574 }
00575
00576 int32
00577 ngram_zero(ngram_model_t *model)
00578 {
00579 return model->log_zero;
00580 }
00581
00582 int32
00583 ngram_model_get_size(ngram_model_t *model)
00584 {
00585 if (model != NULL)
00586 return model->n;
00587 return 0;
00588 }
00589
00590 int32 const *
00591 ngram_model_get_counts(ngram_model_t *model)
00592 {
00593 if (model != NULL)
00594 return model->n_counts;
00595 return NULL;
00596 }
00597
00598 void
00599 ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
00600 int m, int successor)
00601 {
00602 itor->model = model;
00603 itor->wids = ckd_calloc(m, sizeof(*itor->wids));
00604 itor->m = m;
00605 itor->successor = successor;
00606 }
00607
00608 ngram_iter_t *
00609 ngram_model_mgrams(ngram_model_t *model, int m)
00610 {
00611 ngram_iter_t *itor;
00612
00613 if (m >= model->n)
00614 return NULL;
00615 if (model->funcs->mgrams == NULL)
00616 return NULL;
00617 itor = (*model->funcs->mgrams)(model, m);
00618 return itor;
00619 }
00620
00621 ngram_iter_t *
00622 ngram_iter(ngram_model_t *model, const char *word, ...)
00623 {
00624 va_list history;
00625 const char *hword;
00626 int32 *histid;
00627 int32 n_hist;
00628 ngram_iter_t *itor;
00629
00630 va_start(history, word);
00631 n_hist = 0;
00632 while ((hword = va_arg(history, const char *)) != NULL)
00633 ++n_hist;
00634 va_end(history);
00635
00636 histid = ckd_calloc(n_hist, sizeof(*histid));
00637 va_start(history, word);
00638 n_hist = 0;
00639 while ((hword = va_arg(history, const char *)) != NULL) {
00640 histid[n_hist] = ngram_wid(model, hword);
00641 ++n_hist;
00642 }
00643 va_end(history);
00644
00645 itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
00646 ckd_free(histid);
00647 return itor;
00648 }
00649
00650 ngram_iter_t *
00651 ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
00652 {
00653 if (n_hist >= model->n)
00654 return NULL;
00655 if (model->funcs->iter == NULL)
00656 return NULL;
00657 return (*model->funcs->iter)(model, wid, history, n_hist);
00658 }
00659
00660 ngram_iter_t *
00661 ngram_iter_successors(ngram_iter_t *itor)
00662 {
00663
00664 if (itor->m == itor->model->n - 1)
00665 return NULL;
00666 return (*itor->model->funcs->successors)(itor);
00667 }
00668
00669 int32 const *
00670 ngram_iter_get(ngram_iter_t *itor,
00671 int32 *out_score,
00672 int32 *out_bowt)
00673 {
00674 return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
00675 }
00676
00677 ngram_iter_t *
00678 ngram_iter_next(ngram_iter_t *itor)
00679 {
00680 return (*itor->model->funcs->iter_next)(itor);
00681 }
00682
00683 void
00684 ngram_iter_free(ngram_iter_t *itor)
00685 {
00686 ckd_free(itor->wids);
00687 (*itor->model->funcs->iter_free)(itor);
00688 }
00689
00690 int32
00691 ngram_wid(ngram_model_t *model, const char *word)
00692 {
00693 int32 val;
00694
00695 if (hash_table_lookup_int32(model->wid, word, &val) == -1)
00696 return ngram_unknown_wid(model);
00697 else
00698 return val;
00699 }
00700
00701 const char *
00702 ngram_word(ngram_model_t *model, int32 wid)
00703 {
00704
00705 wid = NGRAM_BASEWID(wid);
00706 if (wid >= model->n_words)
00707 return NULL;
00708 return model->word_str[wid];
00709 }
00710
00714 int32
00715 ngram_add_word_internal(ngram_model_t *model,
00716 const char *word,
00717 int32 classid)
00718 {
00719 void *dummy;
00720 int32 wid;
00721
00722
00723 wid = model->n_words;
00724 if (classid >= 0) {
00725 wid = NGRAM_CLASSWID(wid, classid);
00726 }
00727
00728 if (hash_table_lookup(model->wid, word, &dummy) == 0) {
00729 E_ERROR("Duplicate definition of word %s\n", word);
00730 return NGRAM_INVALID_WID;
00731 }
00732
00733 if (model->n_words >= model->n_1g_alloc) {
00734 model->n_1g_alloc += UG_ALLOC_STEP;
00735 model->word_str = ckd_realloc(model->word_str,
00736 sizeof(*model->word_str) * model->n_1g_alloc);
00737 }
00738
00739
00740 model->word_str[model->n_words] = ckd_salloc(word);
00741
00742 if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
00743 E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
00744 model->word_str[model->n_words], (void *)(long)(wid));
00745 }
00746
00747 ++model->n_words;
00748 return wid;
00749 }
00750
00751 int32
00752 ngram_model_add_word(ngram_model_t *model,
00753 const char *word, float32 weight)
00754 {
00755 int32 wid, prob = model->log_zero;
00756
00757 wid = ngram_add_word_internal(model, word, -1);
00758 if (wid == NGRAM_INVALID_WID)
00759 return wid;
00760
00761
00762 if (model->funcs && model->funcs->add_ug)
00763 prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
00764 if (prob == 0) {
00765 if (model->writable)
00766 ckd_free(model->word_str[wid]);
00767 return -1;
00768 }
00769 return wid;
00770 }
00771
00772 ngram_class_t *
00773 ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
00774 {
00775 ngram_class_t *lmclass;
00776 gnode_t *gn;
00777 float32 tprob;
00778 int i;
00779
00780 lmclass = ckd_calloc(1, sizeof(*lmclass));
00781 lmclass->tag_wid = tag_wid;
00782
00783 lmclass->start_wid = start_wid;
00784 lmclass->n_words = glist_count(classwords);
00785 lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
00786 lmclass->nword_hash = NULL;
00787 lmclass->n_hash = 0;
00788 tprob = 0.0;
00789 for (gn = classwords; gn; gn = gnode_next(gn)) {
00790 tprob += gnode_float32(gn);
00791 }
00792 if (tprob > 1.1 || tprob < 0.9) {
00793 E_WARN("Total class probability is %f, will normalize\n", tprob);
00794 for (gn = classwords; gn; gn = gnode_next(gn)) {
00795 gn->data.fl /= tprob;
00796 }
00797 }
00798 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
00799 lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
00800 }
00801
00802 return lmclass;
00803 }
00804
00805 int32
00806 ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
00807 {
00808 int32 hash;
00809
00810 if (lmclass->nword_hash == NULL) {
00811
00812 lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00813 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
00814 lmclass->n_hash = NGRAM_HASH_SIZE;
00815 lmclass->n_hash_inuse = 0;
00816 }
00817
00818
00819
00820 hash = wid & (lmclass->n_hash - 1);
00821 if (lmclass->nword_hash[hash].wid == -1) {
00822
00823 lmclass->nword_hash[hash].wid = wid;
00824 lmclass->nword_hash[hash].prob1 = lweight;
00825 ++lmclass->n_hash_inuse;
00826 return hash;
00827 }
00828 else {
00829 int32 next;
00830
00831 while (lmclass->nword_hash[hash].next != -1)
00832 hash = lmclass->nword_hash[hash].next;
00833 assert(hash != -1);
00834
00835 if (lmclass->n_hash_inuse == lmclass->n_hash) {
00836
00837 lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
00838 lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
00839 memset(lmclass->nword_hash + lmclass->n_hash,
00840 0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
00841
00842 next = lmclass->n_hash;
00843 lmclass->n_hash *= 2;
00844 }
00845 else {
00846
00847 for (next = 0; next < lmclass->n_hash; ++next)
00848 if (lmclass->nword_hash[next].wid == -1)
00849 break;
00850
00851 assert(next != lmclass->n_hash);
00852 }
00853 lmclass->nword_hash[next].wid = wid;
00854 lmclass->nword_hash[next].prob1 = lweight;
00855 lmclass->nword_hash[hash].next = next;
00856 ++lmclass->n_hash_inuse;
00857 return next;
00858 }
00859 }
00860
00861 void
00862 ngram_class_free(ngram_class_t *lmclass)
00863 {
00864 ckd_free(lmclass->nword_hash);
00865 ckd_free(lmclass->prob1);
00866 ckd_free(lmclass);
00867 }
00868
00869 int32
00870 ngram_model_add_class_word(ngram_model_t *model,
00871 const char *classname,
00872 const char *word,
00873 float32 weight)
00874 {
00875 ngram_class_t *lmclass;
00876 int32 classid, tag_wid, wid, i, scale;
00877 float32 fprob;
00878
00879
00880
00881
00882 tag_wid = ngram_wid(model, classname);
00883 if (tag_wid == NGRAM_INVALID_WID) {
00884 E_ERROR("No such word or class tag: %s\n", classname);
00885 return tag_wid;
00886 }
00887 for (classid = 0; classid < model->n_classes; ++classid) {
00888 if (model->classes[classid]->tag_wid == tag_wid)
00889 break;
00890 }
00891
00892 if (classid == model->n_classes) {
00893 E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
00894 return NGRAM_INVALID_WID;
00895 }
00896 lmclass = model->classes[classid];
00897
00898
00899 wid = ngram_add_word_internal(model, word, classid);
00900 if (wid == NGRAM_INVALID_WID)
00901 return wid;
00902
00903
00904 fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
00905
00906
00907
00908 scale = logmath_log(model->lmath, 1.0 - fprob);
00909 for (i = 0; i < lmclass->n_words; ++i)
00910 lmclass->prob1[i] += scale;
00911 for (i = 0; i < lmclass->n_hash; ++i)
00912 if (lmclass->nword_hash[i].wid != -1)
00913 lmclass->nword_hash[i].prob1 += scale;
00914
00915
00916 return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
00917 }
00918
00919 int32
00920 ngram_model_add_class(ngram_model_t *model,
00921 const char *classname,
00922 float32 classweight,
00923 char **words,
00924 const float32 *weights,
00925 int32 n_words)
00926 {
00927 ngram_class_t *lmclass;
00928 glist_t classwords = NULL;
00929 int32 i, start_wid = -1;
00930 int32 classid, tag_wid;
00931
00932
00933 if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
00934 tag_wid = ngram_model_add_word(model, classname, classweight);
00935 if (tag_wid == NGRAM_INVALID_WID)
00936 return -1;
00937 }
00938
00939 if (model->n_classes == 128) {
00940 E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
00941 return -1;
00942 }
00943 classid = model->n_classes;
00944 for (i = 0; i < n_words; ++i) {
00945 int32 wid;
00946
00947 wid = ngram_add_word_internal(model, words[i], classid);
00948 if (wid == NGRAM_INVALID_WID)
00949 return -1;
00950 if (start_wid == -1)
00951 start_wid = NGRAM_BASEWID(wid);
00952 classwords = glist_add_float32(classwords, weights[i]);
00953 }
00954 classwords = glist_reverse(classwords);
00955 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
00956 glist_free(classwords);
00957 if (lmclass == NULL)
00958 return -1;
00959
00960 ++model->n_classes;
00961 if (model->classes == NULL)
00962 model->classes = ckd_calloc(1, sizeof(*model->classes));
00963 else
00964 model->classes = ckd_realloc(model->classes,
00965 model->n_classes * sizeof(*model->classes));
00966 model->classes[classid] = lmclass;
00967 return classid;
00968 }
00969
00970 int32
00971 ngram_class_prob(ngram_class_t *lmclass, int32 wid)
00972 {
00973 int32 base_wid = NGRAM_BASEWID(wid);
00974
00975 if (base_wid < lmclass->start_wid
00976 || base_wid > lmclass->start_wid + lmclass->n_words) {
00977 int32 hash;
00978
00979
00980 hash = wid & (lmclass->n_hash - 1);
00981 while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
00982 hash = lmclass->nword_hash[hash].next;
00983 if (hash == -1)
00984 return 1;
00985 return lmclass->nword_hash[hash].prob1;
00986 }
00987 else {
00988 return lmclass->prob1[base_wid - lmclass->start_wid];
00989 }
00990 }
00991
00992 int32
00993 read_classdef_file(hash_table_t *classes, const char *file_name)
00994 {
00995 FILE *fp;
00996 int32 is_pipe;
00997 int inclass;
00998 int32 rv = -1;
00999 gnode_t *gn;
01000 glist_t classwords = NULL;
01001 glist_t classprobs = NULL;
01002 char *classname = NULL;
01003
01004 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
01005 E_ERROR("File %s not found\n", file_name);
01006 return -1;
01007 }
01008
01009 inclass = FALSE;
01010 while (!feof(fp)) {
01011 char line[512];
01012 char *wptr[2];
01013 int n_words;
01014
01015 if (fgets(line, sizeof(line), fp) == NULL)
01016 break;
01017
01018 n_words = str2words(line, wptr, 2);
01019 if (n_words <= 0)
01020 continue;
01021
01022 if (inclass) {
01023
01024 if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
01025 classdef_t *classdef;
01026 gnode_t *word, *weight;
01027 int32 i;
01028
01029 if (classname == NULL || 0 != strcmp(wptr[1], classname))
01030 goto error_out;
01031 inclass = FALSE;
01032
01033
01034 classdef = ckd_calloc(1, sizeof(*classdef));
01035 classwords = glist_reverse(classwords);
01036 classprobs = glist_reverse(classprobs);
01037 classdef->n_words = glist_count(classwords);
01038 classdef->words = ckd_calloc(classdef->n_words,
01039 sizeof(*classdef->words));
01040 classdef->weights = ckd_calloc(classdef->n_words,
01041 sizeof(*classdef->weights));
01042 word = classwords;
01043 weight = classprobs;
01044 for (i = 0; i < classdef->n_words; ++i) {
01045 classdef->words[i] = gnode_ptr(word);
01046 classdef->weights[i] = gnode_float32(weight);
01047 word = gnode_next(word);
01048 weight = gnode_next(weight);
01049 }
01050
01051
01052 if (hash_table_enter(classes, classname, classdef) != classdef) {
01053 classdef_free(classdef);
01054 goto error_out;
01055 }
01056
01057
01058 glist_free(classwords);
01059 glist_free(classprobs);
01060 classwords = NULL;
01061 classprobs = NULL;
01062 classname = NULL;
01063 }
01064 else {
01065 float32 fprob;
01066
01067 if (n_words == 2)
01068 fprob = (float32)atof_c(wptr[1]);
01069 else
01070 fprob = 1.0f;
01071
01072 classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
01073 classprobs = glist_add_float32(classprobs, fprob);
01074 }
01075 }
01076 else {
01077
01078 if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
01079 if (inclass)
01080 goto error_out;
01081 inclass = TRUE;
01082 classname = ckd_salloc(wptr[1]);
01083 }
01084
01085 }
01086 }
01087 rv = 0;
01088
01089 error_out:
01090
01091 fclose_comp(fp, is_pipe);
01092 for (gn = classwords; gn; gn = gnode_next(gn))
01093 ckd_free(gnode_ptr(gn));
01094 glist_free(classwords);
01095 glist_free(classprobs);
01096 ckd_free(classname);
01097
01098 return rv;
01099 }
01100
01101 void
01102 classdef_free(classdef_t *classdef)
01103 {
01104 int32 i;
01105 for (i = 0; i < classdef->n_words; ++i)
01106 ckd_free(classdef->words[i]);
01107 ckd_free(classdef->words);
01108 ckd_free(classdef->weights);
01109 ckd_free(classdef);
01110 }
01111
01112
01113 int32
01114 ngram_model_read_classdef(ngram_model_t *model,
01115 const char *file_name)
01116 {
01117 hash_table_t *classes;
01118 glist_t hl = NULL;
01119 gnode_t *gn;
01120 int32 rv = -1;
01121
01122 classes = hash_table_new(0, FALSE);
01123 if (read_classdef_file(classes, file_name) < 0) {
01124 hash_table_free(classes);
01125 return -1;
01126 }
01127
01128
01129 hl = hash_table_tolist(classes, NULL);
01130 for (gn = hl; gn; gn = gnode_next(gn)) {
01131 hash_entry_t *he = gnode_ptr(gn);
01132 classdef_t *classdef = he->val;
01133
01134 if (ngram_model_add_class(model, he->key, 1.0,
01135 classdef->words,
01136 classdef->weights,
01137 classdef->n_words) < 0)
01138 goto error_out;
01139 }
01140 rv = 0;
01141
01142 error_out:
01143 for (gn = hl; gn; gn = gnode_next(gn)) {
01144 hash_entry_t *he = gnode_ptr(gn);
01145 ckd_free((char *)he->key);
01146 classdef_free(he->val);
01147 }
01148 glist_free(hl);
01149 hash_table_free(classes);
01150 return rv;
01151 }