SphinxBase 0.6
ngram_model.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * \file ngram_model.c N-Gram language models.
39 *
40 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
41 */
42
43#include <config.h>
44
#include <assert.h>
#include <errno.h>
#include <string.h>
47
48#ifdef HAVE_ICONV
49#include <iconv.h>
50#endif
51
54#include "sphinxbase/filename.h"
55#include "sphinxbase/pio.h"
56#include "sphinxbase/err.h"
57#include "sphinxbase/logmath.h"
58#include "sphinxbase/strfuncs.h"
59#include "sphinxbase/case.h"
60
61#include "ngram_model_internal.h"
62
64ngram_file_name_to_type(const char *file_name)
65{
66 const char *ext;
67
68 ext = strrchr(file_name, '.');
69 if (ext == NULL) {
70 return NGRAM_INVALID;
71 }
72 if (0 == strcmp_nocase(ext, ".gz")) {
73 while (--ext >= file_name) {
74 if (*ext == '.') break;
75 }
76 if (ext < file_name) {
77 return NGRAM_INVALID;
78 }
79 }
80 else if (0 == strcmp_nocase(ext, ".bz2")) {
81 while (--ext >= file_name) {
82 if (*ext == '.') break;
83 }
84 if (ext < file_name) {
85 return NGRAM_INVALID;
86 }
87 }
88 /* We use strncmp because there might be a .gz on the end. */
89 if (0 == strncmp_nocase(ext, ".ARPA", 5))
90 return NGRAM_ARPA;
91 if (0 == strncmp_nocase(ext, ".DMP", 4))
92 return NGRAM_DMP;
93 return NGRAM_INVALID;
94 }
95
97ngram_str_to_type(const char *str_name)
98{
99 if (0 == strcmp_nocase(str_name, "arpa"))
100 return NGRAM_ARPA;
101 if (0 == strcmp_nocase(str_name, "dmp"))
102 return NGRAM_DMP;
103 return NGRAM_INVALID;
104}
105
106char const *
108{
109 switch (type) {
110 case NGRAM_ARPA:
111 return "arpa";
112 case NGRAM_DMP:
113 return "dmp";
114 default:
115 return NULL;
116 }
117}
118
119
122 const char *file_name,
123 ngram_file_type_t file_type,
124 logmath_t *lmath)
125 {
126 ngram_model_t *model = NULL;
127
128 switch (file_type) {
129 case NGRAM_AUTO: {
130 if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
131 break;
132 if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
133 break;
134 return NULL;
135 }
136 case NGRAM_ARPA:
137 model = ngram_model_arpa_read(config, file_name, lmath);
138 break;
139 case NGRAM_DMP:
140 model = ngram_model_dmp_read(config, file_name, lmath);
141 break;
142 default:
143 E_ERROR("language model file type not supported\n");
144 return NULL;
145 }
146
147 /* Now set weights based on config if present. */
148 if (config) {
149 float32 lw = 1.0;
150 float32 wip = 1.0;
151 float32 uw = 1.0;
152
153 if (cmd_ln_exists_r(config, "-lw"))
154 lw = cmd_ln_float32_r(config, "-lw");
155 if (cmd_ln_exists_r(config, "-wip"))
156 wip = cmd_ln_float32_r(config, "-wip");
157 if (cmd_ln_exists_r(config, "-uw"))
158 uw = cmd_ln_float32_r(config, "-uw");
159
160 ngram_model_apply_weights(model, lw, wip, uw);
161 }
162
163 return model;
164 }
165
166 int
167 ngram_model_write(ngram_model_t *model, const char *file_name,
168 ngram_file_type_t file_type)
169 {
170 switch (file_type) {
171 case NGRAM_AUTO: {
172 file_type = ngram_file_name_to_type(file_name);
173 /* Default to ARPA (catches .lm and other things) */
174 if (file_type == NGRAM_INVALID)
175 file_type = NGRAM_ARPA;
176 return ngram_model_write(model, file_name, file_type);
177 }
178 case NGRAM_ARPA:
179 return ngram_model_arpa_write(model, file_name);
180 case NGRAM_DMP:
181 return ngram_model_dmp_write(model, file_name);
182 default:
183 E_ERROR("language model file type not supported\n");
184 return -1;
185 }
186 E_ERROR("language model file type not supported\n");
187 return -1;
188 }
189
190 int32
191 ngram_model_init(ngram_model_t *base,
192 ngram_funcs_t *funcs,
193 logmath_t *lmath,
194 int32 n, int32 n_unigram)
195 {
196 base->refcount = 1;
197 base->funcs = funcs;
198 base->n = n;
199 /* If this was previously initialized... */
200 if (base->n_counts == NULL)
201 base->n_counts = ckd_calloc(3, sizeof(*base->n_counts));
202 /* Don't reset weights if logmath object hasn't changed. */
203 if (base->lmath != lmath) {
204 /* Set default values for weights. */
205 base->lw = 1.0;
206 base->log_wip = 0; /* i.e. 1.0 */
207 base->log_uw = 0; /* i.e. 1.0 */
208 base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
210 base->log_zero = logmath_get_zero(lmath);
211 base->lmath = lmath;
212 }
213 /* Allocate or reallocate space for word strings. */
214 if (base->word_str) {
215 /* Free all previous word strings if they were allocated. */
216 if (base->writable) {
217 int32 i;
218 for (i = 0; i < base->n_words; ++i) {
219 ckd_free(base->word_str[i]);
220 base->word_str[i] = NULL;
221 }
222 }
223 base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
224 }
225 else
226 base->word_str = ckd_calloc(n_unigram, sizeof(char *));
227 /* NOTE: They are no longer case-insensitive since we are allowing
228 * other encodings for word strings. Beware. */
229 if (base->wid)
230 hash_table_empty(base->wid);
231 else
232 base->wid = hash_table_new(n_unigram, FALSE);
233 base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;
234
235 return 0;
236}
237
240{
241 ++model->refcount;
242 return model;
243}
244
245
246void
248{
249 if (model->funcs && model->funcs->flush)
250 (*model->funcs->flush)(model);
251}
252
253int
255{
256 int i;
257
258 if (model == NULL)
259 return 0;
260 if (--model->refcount > 0)
261 return model->refcount;
262 if (model->funcs && model->funcs->free)
263 (*model->funcs->free)(model);
264 if (model->writable) {
265 /* Free all words. */
266 for (i = 0; i < model->n_words; ++i) {
267 ckd_free(model->word_str[i]);
268 }
269 }
270 else {
271 /* Free all class words. */
272 for (i = 0; i < model->n_classes; ++i) {
273 ngram_class_t *lmclass;
274 int32 j;
275
276 lmclass = model->classes[i];
277 for (j = 0; j < lmclass->n_words; ++j) {
278 ckd_free(model->word_str[lmclass->start_wid + j]);
279 }
280 for (j = 0; j < lmclass->n_hash; ++j) {
281 if (lmclass->nword_hash[j].wid != -1) {
282 ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
283 }
284 }
285 }
286 }
287 for (i = 0; i < model->n_classes; ++i) {
288 ngram_class_free(model->classes[i]);
289 }
290 ckd_free(model->classes);
291 hash_table_free(model->wid);
292 ckd_free(model->word_str);
293 ckd_free(model->n_counts);
294 ckd_free(model);
295 return 0;
296}
297
298int
300{
301 int writable, i;
302 hash_table_t *new_wid;
303
304 /* Were word strings already allocated? */
305 writable = model->writable;
306 /* Either way, we are going to allocate some word strings. */
307 model->writable = TRUE;
308
309 /* And, don't forget, we need to rebuild the word to unigram ID
310 * mapping. */
311 new_wid = hash_table_new(model->n_words, FALSE);
312 for (i = 0; i < model->n_words; ++i) {
313 char *outstr;
314 if (writable) {
315 outstr = model->word_str[i];
316 }
317 else {
318 outstr = ckd_salloc(model->word_str[i]);
319 }
320 /* Don't case-fold <tags> or [classes] */
321 if (outstr[0] == '<' || outstr[0] == '[') {
322 }
323 else {
324 switch (kase) {
325 case NGRAM_UPPER:
326 ucase(outstr);
327 break;
328 case NGRAM_LOWER:
329 lcase(outstr);
330 break;
331 default:
332 ;
333 }
334 }
335 model->word_str[i] = outstr;
336
337 /* Now update the hash table. We might have terrible
338 * collisions here, so warn about them. */
339 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
340 E_WARN("Duplicate word in dictionary after conversion: %s\n",
341 model->word_str[i]);
342 }
343 }
344 /* Swap out the hash table. */
345 hash_table_free(model->wid);
346 model->wid = new_wid;
347 return 0;
348}
349
350#ifdef HAVE_ICONV
351int
352ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
353{
354 iconv_t ic;
355 char *outbuf;
356 size_t maxlen;
357 int i, writable;
358 hash_table_t *new_wid;
359
360 /* FIXME: Need to do a special case thing for the GB-HEX encoding
361 * used in Sphinx3 Mandarin models. */
362 if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
363 E_ERROR_SYSTEM("iconv_open() failed");
364 return -1;
365 }
366 /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
367 * unlike wcstombs(3). So we have to either call it over and over
368 * again until our buffer is big enough, or call it with a huge
369 * buffer and then copy things back to the output. We will use a
370 * mix of these two approaches here. We'll keep a single big
371 * buffer around, and expand it as necessary.
372 */
373 maxlen = 0;
374 for (i = 0; i < model->n_words; ++i) {
375 if (strlen(model->word_str[i]) > maxlen)
376 maxlen = strlen(model->word_str[i]);
377 }
378 /* Were word strings already allocated? */
379 writable = model->writable;
380 /* Either way, we are going to allocate some word strings. */
381 model->writable = TRUE;
382 /* Really should be big enough except for pathological cases. */
383 maxlen = maxlen * sizeof(int) + 15;
384 outbuf = ckd_calloc(maxlen, 1);
385 /* And, don't forget, we need to rebuild the word to unigram ID
386 * mapping. */
387 new_wid = hash_table_new(model->n_words, FALSE);
388 for (i = 0; i < model->n_words; ++i) {
389 ICONV_CONST char *in;
390 char *out;
391 size_t inleft, outleft, result;
392
393 start_conversion:
394 in = (ICONV_CONST char *)model->word_str[i];
395 /* Yes, this assumes that we don't have any NUL bytes. */
396 inleft = strlen(in);
397 out = outbuf;
398 outleft = maxlen;
399
400 while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
401 if (errno != E2BIG) {
402 /* FIXME: if we already converted any words, then they
403 * are going to be in an inconsistent state. */
404 E_ERROR_SYSTEM("iconv() failed");
405 ckd_free(outbuf);
406 hash_table_free(new_wid);
407 return -1;
408 }
409 /* Reset the internal state of conversion. */
410 iconv(ic, NULL, NULL, NULL, NULL);
411 /* Make everything bigger. */
412 maxlen *= 2;
413 out = outbuf = ckd_realloc(outbuf, maxlen);
414 /* Reset the input pointers. */
415 in = (ICONV_CONST char *)model->word_str[i];
416 inleft = strlen(in);
417 }
418
419 /* Now flush a shift-out sequence, if any. */
420 if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
421 if (errno != E2BIG) {
422 /* FIXME: if we already converted any words, then they
423 * are going to be in an inconsistent state. */
424 E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
425 ckd_free(outbuf);
426 hash_table_free(new_wid);
427 return -1;
428 }
429 /* Reset the internal state of conversion. */
430 iconv(ic, NULL, NULL, NULL, NULL);
431 /* Make everything bigger. */
432 maxlen *= 2;
433 outbuf = ckd_realloc(outbuf, maxlen);
434 /* Be very evil. */
435 goto start_conversion;
436 }
437
438 result = maxlen - outleft;
439 /* Okay, that was hard, now let's go shopping. */
440 if (writable) {
441 /* Grow or shrink the output string as necessary. */
442 model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
443 model->word_str[i][result] = '\0';
444 }
445 else {
446 /* It actually was not allocated previously, so do that now. */
447 model->word_str[i] = ckd_calloc(result + 1, 1);
448 }
449 /* Copy the new thing in. */
450 memcpy(model->word_str[i], outbuf, result);
451
452 /* Now update the hash table. We might have terrible
453 * collisions if a non-reversible conversion was requested.,
454 * so warn about them. */
455 if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
456 E_WARN("Duplicate word in dictionary after conversion: %s\n",
457 model->word_str[i]);
458 }
459 }
460 ckd_free(outbuf);
461 iconv_close(ic);
462 /* Swap out the hash table. */
463 hash_table_free(model->wid);
464 model->wid = new_wid;
465
466 return 0;
467}
468#else /* !HAVE_ICONV */
469int
470ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
471{
472 return -1;
473}
474#endif /* !HAVE_ICONV */
475
476int
478 float32 lw, float32 wip, float32 uw)
479{
480 return (*model->funcs->apply_weights)(model, lw, wip, uw);
481}
482
483float32
484ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
485 int32 *out_log_uw)
486{
487 if (out_log_wip) *out_log_wip = model->log_wip;
488 if (out_log_uw) *out_log_uw = model->log_uw;
489 return model->lw;
490}
491
492
493int32
494ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
495 int32 n_hist, int32 *n_used)
496{
497 int32 score, class_weight = 0;
498 int i;
499
500 /* Closed vocabulary, OOV word probability is zero */
501 if (wid == NGRAM_INVALID_WID)
502 return model->log_zero;
503
504 /* "Declassify" wid and history */
505 if (NGRAM_IS_CLASSWID(wid)) {
506 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
507
508 class_weight = ngram_class_prob(lmclass, wid);
509 if (class_weight == 1) /* Meaning, not found in class. */
510 return model->log_zero;
511 wid = lmclass->tag_wid;
512 }
513 for (i = 0; i < n_hist; ++i) {
514 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
515 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
516 }
517 score = (*model->funcs->score)(model, wid, history, n_hist, n_used);
518
519 /* Multiply by unigram in-class weight. */
520 return score + class_weight;
521}
522
523int32
524ngram_score(ngram_model_t *model, const char *word, ...)
525{
526 va_list history;
527 const char *hword;
528 int32 *histid;
529 int32 n_hist;
530 int32 n_used;
531 int32 prob;
532
533 va_start(history, word);
534 n_hist = 0;
535 while ((hword = va_arg(history, const char *)) != NULL)
536 ++n_hist;
537 va_end(history);
538
539 histid = ckd_calloc(n_hist, sizeof(*histid));
540 va_start(history, word);
541 n_hist = 0;
542 while ((hword = va_arg(history, const char *)) != NULL) {
543 histid[n_hist] = ngram_wid(model, hword);
544 ++n_hist;
545 }
546 va_end(history);
547
548 prob = ngram_ng_score(model, ngram_wid(model, word),
549 histid, n_hist, &n_used);
550 ckd_free(histid);
551 return prob;
552}
553
554int32
555ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
556{
557 int32 hist[2];
558 hist[0] = w2;
559 hist[1] = w1;
560 return ngram_ng_score(model, w3, hist, 2, n_used);
561}
562
563int32
564ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
565{
566 return ngram_ng_score(model, w2, &w1, 1, n_used);
567}
568
569int32
570ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
571 int32 n_hist, int32 *n_used)
572{
573 int32 prob, class_weight = 0;
574 int i;
575
576 /* Closed vocabulary, OOV word probability is zero */
577 if (wid == NGRAM_INVALID_WID)
578 return model->log_zero;
579
580 /* "Declassify" wid and history */
581 if (NGRAM_IS_CLASSWID(wid)) {
582 ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];
583
584 class_weight = ngram_class_prob(lmclass, wid);
585 if (class_weight == 1) /* Meaning, not found in class. */
586 return class_weight;
587 wid = lmclass->tag_wid;
588 }
589 for (i = 0; i < n_hist; ++i) {
590 if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
591 history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
592 }
593 prob = (*model->funcs->raw_score)(model, wid, history,
594 n_hist, n_used);
595 /* Multiply by unigram in-class weight. */
596 return prob + class_weight;
597}
598
599int32
600ngram_prob(ngram_model_t *model, const char *word, ...)
601{
602 va_list history;
603 const char *hword;
604 int32 *histid;
605 int32 n_hist;
606 int32 n_used;
607 int32 prob;
608
609 va_start(history, word);
610 n_hist = 0;
611 while ((hword = va_arg(history, const char *)) != NULL)
612 ++n_hist;
613 va_end(history);
614
615 histid = ckd_calloc(n_hist, sizeof(*histid));
616 va_start(history, word);
617 n_hist = 0;
618 while ((hword = va_arg(history, const char *)) != NULL) {
619 histid[n_hist] = ngram_wid(model, hword);
620 ++n_hist;
621 }
622 va_end(history);
623
624 prob = ngram_ng_prob(model, ngram_wid(model, word),
625 histid, n_hist, &n_used);
626 ckd_free(histid);
627 return prob;
628}
629
630int32
632{
633 int32 prob;
634
635 /* Undo insertion penalty. */
636 prob = score - base->log_wip;
637 /* Undo language weight. */
638 prob = (int32)(prob / base->lw);
639
640 return prob;
641}
642
643int32
645{
646 int32 val;
647
648 /* FIXME: This could be memoized for speed if necessary. */
649 /* Look up <UNK>, if not found return NGRAM_INVALID_WID. */
650 if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
651 return NGRAM_INVALID_WID;
652 else
653 return val;
654}
655
656int32
658{
659 return model->log_zero;
660}
661
662int32
664{
665 if (model != NULL)
666 return model->n;
667 return 0;
668}
669
670int32 const *
672{
673 if (model != NULL)
674 return model->n_counts;
675 return NULL;
676}
677
678void
679ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
680 int m, int successor)
681{
682 itor->model = model;
683 itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
684 itor->m = m;
685 itor->successor = successor;
686}
687
690{
691 ngram_iter_t *itor;
692 /* The fact that m=n-1 is not exactly obvious. Prevent accidents. */
693 if (m >= model->n)
694 return NULL;
695 if (model->funcs->mgrams == NULL)
696 return NULL;
697 itor = (*model->funcs->mgrams)(model, m);
698 return itor;
699}
700
702ngram_iter(ngram_model_t *model, const char *word, ...)
703{
704 va_list history;
705 const char *hword;
706 int32 *histid;
707 int32 n_hist;
708 ngram_iter_t *itor;
709
710 va_start(history, word);
711 n_hist = 0;
712 while ((hword = va_arg(history, const char *)) != NULL)
713 ++n_hist;
714 va_end(history);
715
716 histid = ckd_calloc(n_hist, sizeof(*histid));
717 va_start(history, word);
718 n_hist = 0;
719 while ((hword = va_arg(history, const char *)) != NULL) {
720 histid[n_hist] = ngram_wid(model, hword);
721 ++n_hist;
722 }
723 va_end(history);
724
725 itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
726 ckd_free(histid);
727 return itor;
728}
729
731ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
732{
733 if (n_hist >= model->n)
734 return NULL;
735 if (model->funcs->iter == NULL)
736 return NULL;
737 return (*model->funcs->iter)(model, wid, history, n_hist);
738}
739
742{
743 /* Stop when we are at the highest order N-Gram. */
744 if (itor->m == itor->model->n - 1)
745 return NULL;
746 return (*itor->model->funcs->successors)(itor);
747}
748
749int32 const *
751 int32 *out_score,
752 int32 *out_bowt)
753{
754 return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
755}
756
759{
760 return (*itor->model->funcs->iter_next)(itor);
761}
762
763void
765{
766 ckd_free(itor->wids);
767 (*itor->model->funcs->iter_free)(itor);
768}
769
770int32
771ngram_wid(ngram_model_t *model, const char *word)
772{
773 int32 val;
774
775 if (hash_table_lookup_int32(model->wid, word, &val) == -1)
776 return ngram_unknown_wid(model);
777 else
778 return val;
779}
780
781const char *
782ngram_word(ngram_model_t *model, int32 wid)
783{
784 /* Remove any class tag */
785 wid = NGRAM_BASEWID(wid);
786 if (wid >= model->n_words)
787 return NULL;
788 return model->word_str[wid];
789}
790
794int32
795ngram_add_word_internal(ngram_model_t *model,
796 const char *word,
797 int32 classid)
798{
799 void *dummy;
800 int32 wid;
801
802 /* Take the next available word ID */
803 wid = model->n_words;
804 if (classid >= 0) {
805 wid = NGRAM_CLASSWID(wid, classid);
806 }
807 /* Check for hash collisions. */
808 if (hash_table_lookup(model->wid, word, &dummy) == 0) {
809 E_ERROR("Duplicate definition of word %s\n", word);
810 return NGRAM_INVALID_WID;
811 }
812 /* Reallocate word_str if necessary. */
813 if (model->n_words >= model->n_1g_alloc) {
814 model->n_1g_alloc += UG_ALLOC_STEP;
815 model->word_str = ckd_realloc(model->word_str,
816 sizeof(*model->word_str) * model->n_1g_alloc);
817 }
818 /* Add the word string in the appropriate manner. */
819 /* Class words are always dynamically allocated. */
820 model->word_str[model->n_words] = ckd_salloc(word);
821 /* Now enter it into the hash table. */
822 if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
823 E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
824 model->word_str[model->n_words], (void *)(long)(wid));
825 }
826 /* Increment number of words. */
827 ++model->n_words;
828 return wid;
829}
830
831int32
833 const char *word, float32 weight)
834{
835 int32 wid, prob = model->log_zero;
836
837 /* If we add word to unwritable model, we need to make it writable */
838 if (!model->writable) {
839 E_WARN("Can't add word '%s' to read-only language model. "
840 "Disable mmap with '-mmap no' to make it writable\n", word);
841 return -1;
842 }
843
844 wid = ngram_add_word_internal(model, word, -1);
845 if (wid == NGRAM_INVALID_WID)
846 return wid;
847
848 /* Do what needs to be done to add the word to the unigram. */
849 if (model->funcs && model->funcs->add_ug)
850 prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
851 if (prob == 0) {
852 return -1;
853 }
854 return wid;
855}
856
858ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
859{
860 ngram_class_t *lmclass;
861 gnode_t *gn;
862 float32 tprob;
863 int i;
864
865 lmclass = ckd_calloc(1, sizeof(*lmclass));
866 lmclass->tag_wid = tag_wid;
867 /* wid_base is the wid (minus class tag) of the first word in the list. */
868 lmclass->start_wid = start_wid;
869 lmclass->n_words = glist_count(classwords);
870 lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
871 lmclass->nword_hash = NULL;
872 lmclass->n_hash = 0;
873 tprob = 0.0;
874 for (gn = classwords; gn; gn = gnode_next(gn)) {
875 tprob += gnode_float32(gn);
876 }
877 if (tprob > 1.1 || tprob < 0.9) {
878 E_WARN("Total class probability is %f, will normalize\n", tprob);
879 for (gn = classwords; gn; gn = gnode_next(gn)) {
880 gn->data.fl /= tprob;
881 }
882 }
883 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
884 lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
885 }
886
887 return lmclass;
888}
889
890int32
891ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
892{
893 int32 hash;
894
895 if (lmclass->nword_hash == NULL) {
896 /* Initialize everything in it to -1 */
897 lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
898 memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
899 lmclass->n_hash = NGRAM_HASH_SIZE;
900 lmclass->n_hash_inuse = 0;
901 }
902 /* Stupidest possible hash function. This will work pretty well
903 * when this function is called repeatedly with contiguous word
904 * IDs, though... */
905 hash = wid & (lmclass->n_hash - 1);
906 if (lmclass->nword_hash[hash].wid == -1) {
907 /* Good, no collision. */
908 lmclass->nword_hash[hash].wid = wid;
909 lmclass->nword_hash[hash].prob1 = lweight;
910 ++lmclass->n_hash_inuse;
911 return hash;
912 }
913 else {
914 int32 next;
915 /* Collision... Find the end of the hash chain. */
916 while (lmclass->nword_hash[hash].next != -1)
917 hash = lmclass->nword_hash[hash].next;
918 assert(hash != -1);
919 /* Does we has any more bukkit? */
920 if (lmclass->n_hash_inuse == lmclass->n_hash) {
921 /* Oh noes! Ok, we makes more. */
922 lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
923 lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
924 memset(lmclass->nword_hash + lmclass->n_hash,
925 0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
926 /* Just use the next allocated one (easy) */
927 next = lmclass->n_hash;
928 lmclass->n_hash *= 2;
929 }
930 else {
931 /* Look for any available bucket. We hope this doesn't happen. */
932 for (next = 0; next < lmclass->n_hash; ++next)
933 if (lmclass->nword_hash[next].wid == -1)
934 break;
935 /* This should absolutely not happen. */
936 assert(next != lmclass->n_hash);
937 }
938 lmclass->nword_hash[next].wid = wid;
939 lmclass->nword_hash[next].prob1 = lweight;
940 lmclass->nword_hash[hash].next = next;
941 ++lmclass->n_hash_inuse;
942 return next;
943 }
944}
945
946void
947ngram_class_free(ngram_class_t *lmclass)
948{
949 ckd_free(lmclass->nword_hash);
950 ckd_free(lmclass->prob1);
951 ckd_free(lmclass);
952}
953
954int32
956 const char *classname,
957 const char *word,
958 float32 weight)
959{
960 ngram_class_t *lmclass;
961 int32 classid, tag_wid, wid, i, scale;
962 float32 fprob;
963
964 /* Find the class corresponding to classname. Linear search
965 * probably okay here since there won't be very many classes, and
966 * this doesn't have to be fast. */
967 tag_wid = ngram_wid(model, classname);
968 if (tag_wid == NGRAM_INVALID_WID) {
969 E_ERROR("No such word or class tag: %s\n", classname);
970 return tag_wid;
971 }
972 for (classid = 0; classid < model->n_classes; ++classid) {
973 if (model->classes[classid]->tag_wid == tag_wid)
974 break;
975 }
976 /* Hmm, no such class. It's probably not a good idea to create one. */
977 if (classid == model->n_classes) {
978 E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
979 return NGRAM_INVALID_WID;
980 }
981 lmclass = model->classes[classid];
982
983 /* Add this word to the model's set of words. */
984 wid = ngram_add_word_internal(model, word, classid);
985 if (wid == NGRAM_INVALID_WID)
986 return wid;
987
988 /* This is the fixed probability of the new word. */
989 fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
990 /* Now normalize everything else to fit it in. This is
991 * accomplished by simply scaling all the other probabilities
992 * by (1-fprob). */
993 scale = logmath_log(model->lmath, 1.0 - fprob);
994 for (i = 0; i < lmclass->n_words; ++i)
995 lmclass->prob1[i] += scale;
996 for (i = 0; i < lmclass->n_hash; ++i)
997 if (lmclass->nword_hash[i].wid != -1)
998 lmclass->nword_hash[i].prob1 += scale;
999
1000 /* Now add it to the class hash table. */
1001 return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
1002}
1003
1004int32
1006 const char *classname,
1007 float32 classweight,
1008 char **words,
1009 const float32 *weights,
1010 int32 n_words)
1011{
1012 ngram_class_t *lmclass;
1013 glist_t classwords = NULL;
1014 int32 i, start_wid = -1;
1015 int32 classid, tag_wid;
1016
1017 /* Check if classname already exists in model. If not, add it.*/
1018 if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
1019 tag_wid = ngram_model_add_word(model, classname, classweight);
1020 if (tag_wid == NGRAM_INVALID_WID)
1021 return -1;
1022 }
1023
1024 if (model->n_classes == 128) {
1025 E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
1026 return -1;
1027 }
1028 classid = model->n_classes;
1029 for (i = 0; i < n_words; ++i) {
1030 int32 wid;
1031
1032 wid = ngram_add_word_internal(model, words[i], classid);
1033 if (wid == NGRAM_INVALID_WID)
1034 return -1;
1035 if (start_wid == -1)
1036 start_wid = NGRAM_BASEWID(wid);
1037 classwords = glist_add_float32(classwords, weights[i]);
1038 }
1039 classwords = glist_reverse(classwords);
1040 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
1041 glist_free(classwords);
1042 if (lmclass == NULL)
1043 return -1;
1044
1045 ++model->n_classes;
1046 if (model->classes == NULL)
1047 model->classes = ckd_calloc(1, sizeof(*model->classes));
1048 else
1049 model->classes = ckd_realloc(model->classes,
1050 model->n_classes * sizeof(*model->classes));
1051 model->classes[classid] = lmclass;
1052 return classid;
1053}
1054
1055int32
1056ngram_class_prob(ngram_class_t *lmclass, int32 wid)
1057{
1058 int32 base_wid = NGRAM_BASEWID(wid);
1059
1060 if (base_wid < lmclass->start_wid
1061 || base_wid > lmclass->start_wid + lmclass->n_words) {
1062 int32 hash;
1063
1064 /* Look it up in the hash table. */
1065 hash = wid & (lmclass->n_hash - 1);
1066 while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
1067 hash = lmclass->nword_hash[hash].next;
1068 if (hash == -1)
1069 return 1;
1070 return lmclass->nword_hash[hash].prob1;
1071 }
1072 else {
1073 return lmclass->prob1[base_wid - lmclass->start_wid];
1074 }
1075}
1076
1077int32
1078read_classdef_file(hash_table_t *classes, const char *file_name)
1079{
1080 FILE *fp;
1081 int32 is_pipe;
1082 int inclass;
1083 int32 rv = -1;
1084 gnode_t *gn;
1085 glist_t classwords = NULL;
1086 glist_t classprobs = NULL;
1087 char *classname = NULL;
1088
1089 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
1090 E_ERROR("File %s not found\n", file_name);
1091 return -1;
1092 }
1093
1094 inclass = FALSE;
1095 while (!feof(fp)) {
1096 char line[512];
1097 char *wptr[2];
1098 int n_words;
1099
1100 if (fgets(line, sizeof(line), fp) == NULL)
1101 break;
1102
1103 n_words = str2words(line, wptr, 2);
1104 if (n_words <= 0)
1105 continue;
1106
1107 if (inclass) {
1108 /* Look for an end of class marker. */
1109 if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
1110 classdef_t *classdef;
1111 gnode_t *word, *weight;
1112 int32 i;
1113
1114 if (classname == NULL || 0 != strcmp(wptr[1], classname))
1115 goto error_out;
1116 inclass = FALSE;
1117
1118 /* Construct a class from the list of words collected. */
1119 classdef = ckd_calloc(1, sizeof(*classdef));
1120 classwords = glist_reverse(classwords);
1121 classprobs = glist_reverse(classprobs);
1122 classdef->n_words = glist_count(classwords);
1123 classdef->words = ckd_calloc(classdef->n_words,
1124 sizeof(*classdef->words));
1125 classdef->weights = ckd_calloc(classdef->n_words,
1126 sizeof(*classdef->weights));
1127 word = classwords;
1128 weight = classprobs;
1129 for (i = 0; i < classdef->n_words; ++i) {
1130 classdef->words[i] = gnode_ptr(word);
1131 classdef->weights[i] = gnode_float32(weight);
1132 word = gnode_next(word);
1133 weight = gnode_next(weight);
1134 }
1135
1136 /* Add this class to the hash table. */
1137 if (hash_table_enter(classes, classname, classdef) != classdef) {
1138 classdef_free(classdef);
1139 goto error_out;
1140 }
1141
1142 /* Reset everything. */
1143 glist_free(classwords);
1144 glist_free(classprobs);
1145 classwords = NULL;
1146 classprobs = NULL;
1147 classname = NULL;
1148 }
1149 else {
1150 float32 fprob;
1151
1152 if (n_words == 2)
1153 fprob = (float32)atof_c(wptr[1]);
1154 else
1155 fprob = 1.0f;
1156 /* Add it to the list of words for this class. */
1157 classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
1158 classprobs = glist_add_float32(classprobs, fprob);
1159 }
1160 }
1161 else {
1162 /* Start a new LM class if the LMCLASS marker is seen */
1163 if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
1164 if (inclass)
1165 goto error_out;
1166 inclass = TRUE;
1167 classname = ckd_salloc(wptr[1]);
1168 }
1169 /* Otherwise, just ignore whatever junk we got */
1170 }
1171 }
1172 rv = 0; /* Success. */
1173
1174error_out:
1175 /* Free all the stuff we might have allocated. */
1176 fclose_comp(fp, is_pipe);
1177 for (gn = classwords; gn; gn = gnode_next(gn))
1178 ckd_free(gnode_ptr(gn));
1179 glist_free(classwords);
1180 glist_free(classprobs);
1181 ckd_free(classname);
1182
1183 return rv;
1184}
1185
1186void
1187classdef_free(classdef_t *classdef)
1188{
1189 int32 i;
1190 for (i = 0; i < classdef->n_words; ++i)
1191 ckd_free(classdef->words[i]);
1192 ckd_free(classdef->words);
1193 ckd_free(classdef->weights);
1194 ckd_free(classdef);
1195}
1196
1197
1198int32
1200 const char *file_name)
1201{
1202 hash_table_t *classes;
1203 glist_t hl = NULL;
1204 gnode_t *gn;
1205 int32 rv = -1;
1206
1207 classes = hash_table_new(0, FALSE);
1208 if (read_classdef_file(classes, file_name) < 0) {
1209 hash_table_free(classes);
1210 return -1;
1211 }
1212
1213 /* Create a new class in the language model for each classdef. */
1214 hl = hash_table_tolist(classes, NULL);
1215 for (gn = hl; gn; gn = gnode_next(gn)) {
1216 hash_entry_t *he = gnode_ptr(gn);
1217 classdef_t *classdef = he->val;
1218
1219 if (ngram_model_add_class(model, he->key, 1.0,
1220 classdef->words,
1221 classdef->weights,
1222 classdef->n_words) < 0)
1223 goto error_out;
1224 }
1225 rv = 0;
1226
1227error_out:
1228 for (gn = hl; gn; gn = gnode_next(gn)) {
1229 hash_entry_t *he = gnode_ptr(gn);
1230 ckd_free((char *)he->key);
1231 classdef_free(he->val);
1232 }
1233 glist_free(hl);
1234 hash_table_free(classes);
1235 return rv;
1236}
Locale-independent implementation of case swapping operation.
SPHINXBASE_EXPORT void ucase(char *str)
Convert str to all upper case.
SPHINXBASE_EXPORT void lcase(char *str)
Convert str to all lower case.
SPHINXBASE_EXPORT int32 strcmp_nocase(const char *str1, const char *str2)
(FIXME! The implementation is incorrect!) Case insensitive string compare.
Definition case.c:94
SPHINXBASE_EXPORT int32 strncmp_nocase(const char *str1, const char *str2, size_t len)
Like strcmp_nocase() but with a maximum length.
Definition case.c:119
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
#define ckd_malloc(sz)
Macro for ckd_malloc
Definition ckd_alloc.h:253
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
Definition ckd_alloc.h:258
SPHINXBASE_EXPORT int cmd_ln_exists_r(cmd_ln_t *cmdln, char const *name)
Re-entrant version of cmd_ln_exists().
Definition cmd_ln.c:929
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_WARN
Print warning information to standard error stream.
Definition err.h:164
File names related operation.
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
Definition glist.c:169
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed.
Definition glist.c:133
SPHINXBASE_EXPORT glist_t glist_add_float32(glist_t g, float32 val)
Create and prepend a new list node containing a single-precision float.
Definition glist.c:110
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generi...
Definition glist.c:74
#define gnode_ptr(g)
Access the user-defined data pointer stored in a gnode.
Definition glist.h:109
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of element in a given link list.
Definition glist.c:145
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition hash_table.c:695
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
Definition hash_table.h:228
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
Definition hash_table.c:623
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition hash_table.c:309
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
Definition hash_table.c:490
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
Definition hash_table.c:329
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:508
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition hash_table.c:158
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT int logmath_get_zero(logmath_t *lmath)
Get the smallest possible value represented in this base.
Definition logmath.c:374
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
Definition logmath.c:447
N-Gram language models.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter(ngram_model_t *model, const char *word,...)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip, int32 *out_log_uw)
Get the current weights from a language model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
SPHINXBASE_EXPORT int32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
#define NGRAM_INVALID_WID
Impossible word ID.
Definition ngram_model.h:84
@ NGRAM_INVALID
Not a valid file type.
Definition ngram_model.h:77
@ NGRAM_AUTO
Determine file type automatically.
Definition ngram_model.h:78
@ NGRAM_ARPA
ARPABO text format (the standard).
Definition ngram_model.h:79
@ NGRAM_DMP
Sphinx .DMP format.
Definition ngram_model.h:80
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT int ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
Re-encode word strings in an N-Gram model.
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
Definition ngram_model.c:64
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
Definition ngram_model.c:97
file IO related operations.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition pio.c:175
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i....
Definition pio.c:98
Miscellaneous useful string functions.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:115
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition strfuncs.c:56
One class definition from a classdef file.
Opaque structure used to hold the results of command-line parsing.
A node in a generic list.
Definition glist.h:100
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
Definition hash_table.h:149
void * val
Value associated with the above key.
Definition hash_table.h:155
int32 prob1
Probability for this word.
int32 next
Index of next bucket (or -1 for no collision)
int32 wid
Word ID of this bucket.
Implementation of ngram_class_t.
int32 start_wid
Starting base word ID for this class' words.
int32 * prob1
Probability table for base words.
int32 n_hash_inuse
Number of words in nword_hash.
int32 n_hash
Number of buckets in nword_hash (power of 2)
int32 tag_wid
Base word ID for this class tag.
int32 n_words
Number of base words for this class.
Implementation-specific functions for operating on ngram_model_t objects.
ngram_iter_t *(* mgrams)(ngram_model_t *model, int32 m)
Implementation-specific function for iterating.
int(* apply_weights)(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Implementation-specific function for applying language model weights.
ngram_iter_t *(* iter)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Implementation-specific function for iterating.
ngram_iter_t *(* iter_next)(ngram_iter_t *itor)
Implementation-specific function for iterating.
int32(* raw_score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying raw language model probability.
int32(* score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying language model score.
void(* iter_free)(ngram_iter_t *itor)
Implementation-specific function for iterating.
int32 const *(* iter_get)(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Implementation-specific function for iterating.
void(* free)(ngram_model_t *model)
Implementation-specific function for freeing an ngram_model_t.
ngram_iter_t *(* successors)(ngram_iter_t *itor)
Implementation-specific function for iterating.
int32(* add_ug)(ngram_model_t *model, int32 wid, int32 lweight)
Implementation-specific function for adding unigrams.
void(* flush)(ngram_model_t *model)
Implementation-specific function for purging N-Gram cache.
Base iterator structure for N-grams.
int32 * wids
Scratch space for word IDs.
int16 successor
Is this a successor iterator?
int16 m
Order of history.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
struct ngram_class_s ** classes
Word class definitions.
int refcount
Reference count.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 log_wip
Log of word insertion penalty.
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
int32 log_uniform
Log of uniform (0-gram) probability.
int32 log_zero
Zero probability, cached here for quick lookup.
int32 log_uw
Log of unigram weight.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
hash_table_t * wid
Mapping of unigram names to word IDs.
float32 lw
Language model scaling factor.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
int32 log_uniform_weight
Log of uniform weight (i.e.
struct ngram_funcs_s * funcs
Implementation-specific methods.
uint8 n_classes
Number of classes (maximum 128)
char ** word_str
Unigram names.