SphinxBase 0.6

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#include <config.h>

#include <string.h>
#include <assert.h>

#ifdef HAVE_ICONV
#include <iconv.h>
#include <errno.h>              /* errno is tested against E2BIG below */
#endif

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        /* Skip back over the compression suffix to the real extension. */
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp"))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_DMP:
        return "dmp";
    default:
        return NULL;
    }
}
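
/*
 * A minimal usage sketch for the type-detection helpers above (the file
 * names are hypothetical):
 *
 *     ngram_file_name_to_type("wsj.arpa")    returns NGRAM_ARPA
 *     ngram_file_name_to_type("wsj.ARPA.gz") returns NGRAM_ARPA
 *                                            (case-insensitive; the
 *                                            compression suffix is skipped)
 *     ngram_file_name_to_type("wsj.dmp")     returns NGRAM_DMP
 *     ngram_file_name_to_type("wsj.lm")      returns NGRAM_INVALID
 *     ngram_str_to_type("arpa")              returns NGRAM_ARPA
 */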

ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }
    /* The format-specific readers return NULL on failure. */
    if (model == NULL)
        return NULL;

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}

int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        /* Default to ARPA (catches .lm and other things) */
        if (file_type == NGRAM_INVALID)
            file_type = NGRAM_ARPA;
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}
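
/*
 * A minimal sketch of reading a model and converting it between formats
 * (file names are hypothetical; error handling is elided):
 *
 *     logmath_t *lmath = logmath_init(1.0001, 0, FALSE);
 *     ngram_model_t *lm = ngram_model_read(NULL, "wsj.arpa",
 *                                          NGRAM_AUTO, lmath);
 *     if (lm != NULL)
 *         ngram_model_write(lm, "wsj.DMP", NGRAM_DMP);
 *
 * Passing a cmd_ln_t with -lw, -wip, and -uw instead of NULL applies
 * those weights automatically, as in the code above.
 */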

int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate n_counts only if this wasn't previously initialized. */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0;      /* i.e. 1.0 */
        base->log_uw = 0;       /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: Word lookups are no longer case-insensitive, since we allow
     * other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}
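
/*
 * The model is reference-counted: ngram_model_retain() bumps the count,
 * and ngram_model_free() only destroys the model when the count reaches
 * zero.  A minimal sketch:
 *
 *     ngram_model_t *lm2 = ngram_model_retain(lm);   refcount is now 2
 *     ngram_model_free(lm);       returns 1; lm2 is still valid
 *     ngram_model_free(lm2);      refcount reaches 0, model is destroyed
 */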

int
ngram_model_casefold(ngram_model_t *model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
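
/*
 * For example (a sketch; lm is a model loaded as above), folding every
 * ordinary word to lower case while leaving <tags> and [classes]
 * untouched:
 *
 *     ngram_model_casefold(lm, NGRAM_LOWER);
 *
 * Note the warning above: if two words differ only in case, they collide
 * after folding and only one keeps its unigram ID.
 */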

#ifdef HAVE_ICONV
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
     * unlike wcstombs(3).  So we have to either call it over and over
     * again until our buffer is big enough, or call it with a huge
     * buffer and then copy things back to the output.  We will use a
     * mix of these two approaches here.  We'll keep a single big
     * buffer around, and expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Restart the conversion of this word from scratch. */
            outleft = maxlen;
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Be very evil. */
            goto start_conversion;
        }

        result = maxlen - outleft;
        /* Okay, that was hard, now let's go shopping. */
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
#else /* !HAVE_ICONV */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    return -1;
}
#endif /* !HAVE_ICONV */

int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}
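
/*
 * A sketch of re-encoding all word strings in place (the encoding names
 * are whatever iconv_open(3) accepts on the host system; this pair is
 * hypothetical):
 *
 *     if (ngram_model_recode(lm, "gb2312", "utf-8") < 0)
 *         E_ERROR("recoding failed\n");
 *
 * On platforms built without iconv, the stub above always returns -1.
 */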

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}

int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by the in-class weight (an addition in log space). */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    /* Count the history words, then convert them to word IDs. */
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}
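
/*
 * Note the history ordering: the most recent history word comes first,
 * so P("three" | "one two") is queried as in this sketch (the words and
 * model are hypothetical):
 *
 *     int32 n_used, hist[2];
 *     int32 s1 = ngram_score(lm, "three", "two", "one", NULL);
 *     hist[0] = ngram_wid(lm, "two");
 *     hist[1] = ngram_wid(lm, "one");
 *     int32 s2 = ngram_ng_score(lm, ngram_wid(lm, "three"),
 *                               hist, 2, &n_used);
 *
 * Both calls return the same weighted score; n_used reports how much of
 * the context the model actually used before backing off.
 */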

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by the in-class weight (an addition in log space). */
    return prob + class_weight;
}

int32
ngram_prob(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}

int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>; if not found, return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}
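
/*
 * The two scoring interfaces differ only by the applied weights.  In log
 * space,
 *
 *     score = raw_logprob * lw + log_wip
 *
 * which is what ngram_score_to_prob() inverts (approximately, because of
 * the integer truncation):
 *
 *     raw_logprob = (score - log_wip) / lw
 *
 * So ngram_ng_prob() returns the unweighted language model probability,
 * while ngram_ng_score() returns the weighted score used in decoding.
 */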

void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}

ngram_iter_t *
ngram_model_mgrams(ngram_model_t *model, int m)
{
    ngram_iter_t *itor;
    /* The fact that m=n-1 is not exactly obvious.  Prevent accidents. */
    if (m >= model->n)
        return NULL;
    if (model->funcs->mgrams == NULL)
        return NULL;
    itor = (*model->funcs->mgrams)(model, m);
    return itor;
}

ngram_iter_t *
ngram_iter(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    ngram_iter_t *itor;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
    ckd_free(histid);
    return itor;
}

ngram_iter_t *
ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
{
    if (n_hist >= model->n)
        return NULL;
    if (model->funcs->iter == NULL)
        return NULL;
    return (*model->funcs->iter)(model, wid, history, n_hist);
}

ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Stop when we are at the highest order N-Gram. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}

int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}

ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}

void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}

int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}
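
/*
 * A sketch of enumerating the model's M-grams with the iterator interface
 * (m is zero-based, so m = 0 walks the unigrams; that wids[0] holds the
 * unigram's word ID is an assumption based on ngram_iter_init() above):
 *
 *     ngram_iter_t *it;
 *     for (it = ngram_model_mgrams(lm, 0); it; it = ngram_iter_next(it)) {
 *         int32 score, bowt;
 *         int32 const *wids = ngram_iter_get(it, &score, &bowt);
 *         printf("%s %d\n", ngram_word(lm, wids[0]), score);
 *     }
 *
 * Use ngram_iter_free() to abandon an iteration early.
 */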

/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }
    /* Check for hash collisions. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }
    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        if (model->writable)
            ckd_free(model->word_str[wid]);
        return -1;
    }
    return wid;
}

ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}
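
/*
 * A sketch of growing the vocabulary at runtime (the word and weight are
 * hypothetical); the weight is a linear-space value, converted to log
 * space internally before being handed to the backend's add_ug:
 *
 *     int32 wid = ngram_model_add_word(lm, "newword", 0.0001f);
 *     if (wid == NGRAM_INVALID_WID)
 *         E_ERROR("could not add word\n");
 */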

int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Do we have any more buckets? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so grow the table. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}

int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}
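
/*
 * A sketch of defining a word class at runtime and then extending it
 * (all names and weights here are hypothetical):
 *
 *     const char *words[] = { "pittsburgh", "boston" };
 *     float32 weights[] = { 0.5f, 0.5f };
 *     int32 classid = ngram_model_add_class(lm, "[city]", 1.0f,
 *                                           (char **)words, weights, 2);
 *     if (classid >= 0)
 *         ngram_model_add_class_word(lm, "[city]", "seattle", 1.0f);
 *
 * Scoring then treats any member word as the [city] tag plus its
 * in-class log probability (see ngram_ng_score() above).
 */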

int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0;                     /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}

void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}

int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
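
/*
 * The class definition file format parsed above, as a hypothetical
 * example (weights are optional and default to 1.0; totals far from 1.0
 * are normalized in ngram_class_new()):
 *
 *     LMCLASS [city]
 *     pittsburgh 0.5
 *     boston 0.3
 *     seattle 0.2
 *     END [city]
 *
 * Each class read this way is registered via ngram_model_add_class()
 * with a class weight of 1.0.
 */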