SphinxBase 0.6
ngram_model_set.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
42#include <string.h>
43#include <stdlib.h>
44
45#include "sphinxbase/err.h"
47#include "sphinxbase/strfuncs.h"
48#include "sphinxbase/filename.h"
49
50#include "ngram_model_set.h"
51
52static ngram_funcs_t ngram_model_set_funcs;
53
54static int
55my_compare(const void *a, const void *b)
56{
57 /* Make sure <UNK> floats to the beginning. */
58 if (strcmp(*(char * const *)a, "<UNK>") == 0)
59 return -1;
60 else if (strcmp(*(char * const *)b, "<UNK>") == 0)
61 return 1;
62 else
63 return strcmp(*(char * const *)a, *(char * const *)b);
64}
65
66static void
67build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n)
68{
70 ngram_model_t **models = set->lms;
71 hash_table_t *vocab;
72 glist_t hlist;
73 gnode_t *gn;
74 int32 i;
75
76 /* Construct a merged vocabulary and a set of word-ID mappings. */
77 vocab = hash_table_new(models[0]->n_words, FALSE);
78 /* Create the set of merged words. */
79 for (i = 0; i < set->n_models; ++i) {
80 int32 j;
81 for (j = 0; j < models[i]->n_words; ++j) {
82 /* Ignore collisions. */
83 (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j);
84 }
85 }
86 /* Create the array of words, then sort it. */
87 if (hash_table_lookup(vocab, "<UNK>", NULL) != 0)
88 (void)hash_table_enter_int32(vocab, "<UNK>", 0);
89 /* Now we know the number of unigrams, initialize the base model. */
90 ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab));
91 base->writable = FALSE; /* We will reuse the pointers from the submodels. */
92 i = 0;
93 hlist = hash_table_tolist(vocab, NULL);
94 for (gn = hlist; gn; gn = gnode_next(gn)) {
95 hash_entry_t *ent = gnode_ptr(gn);
96 base->word_str[i++] = (char *)ent->key;
97 }
98 glist_free(hlist);
99 qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare);
100
101 /* Now create the word ID mappings. */
102 if (set->widmap)
103 ckd_free_2d((void **)set->widmap);
104 set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models,
105 sizeof(**set->widmap));
106 for (i = 0; i < base->n_words; ++i) {
107 int32 j;
108 /* Also create the master wid mapping. */
109 (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
110 /* printf("%s: %d => ", base->word_str[i], i); */
111 for (j = 0; j < set->n_models; ++j) {
112 set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]);
113 /* printf("%d ", set->widmap[i][j]); */
114 }
115 /* printf("\n"); */
116 }
117 hash_table_free(vocab);
118}
119
122 ngram_model_t **models,
123 char **names,
124 const float32 *weights,
125 int32 n_models)
126{
127 ngram_model_set_t *model;
128 ngram_model_t *base;
129 logmath_t *lmath;
130 int32 i, n;
131
132 if (n_models == 0) /* WTF */
133 return NULL;
134
135 /* Do consistency checking on the models. They must all use the
136 * same logbase and shift. */
137 lmath = models[0]->lmath;
138 for (i = 1; i < n_models; ++i) {
139 if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath)
140 || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) {
141 E_ERROR("Log-math parameters don't match, will not create LM set\n");
142 return NULL;
143 }
144 }
145
146 /* Allocate the combined model, initialize it. */
147 model = ckd_calloc(1, sizeof(*model));
148 base = &model->base;
149 model->n_models = n_models;
150 model->lms = ckd_calloc(n_models, sizeof(*model->lms));
151 model->names = ckd_calloc(n_models, sizeof(*model->names));
152 /* Initialize weights to a uniform distribution */
153 model->lweights = ckd_calloc(n_models, sizeof(*model->lweights));
154 {
155 int32 uniform = logmath_log(lmath, 1.0/n_models);
156 for (i = 0; i < n_models; ++i)
157 model->lweights[i] = uniform;
158 }
159 /* Default to interpolate if weights were given. */
160 if (weights)
161 model->cur = -1;
162
163 n = 0;
164 for (i = 0; i < n_models; ++i) {
165 model->lms[i] = models[i];
166 model->names[i] = ckd_salloc(names[i]);
167 if (weights)
168 model->lweights[i] = logmath_log(lmath, weights[i]);
169 /* N is the maximum of all merged models. */
170 if (models[i]->n > n)
171 n = models[i]->n;
172 }
173 /* Allocate the history mapping table. */
174 model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist));
175
176 /* Now build the word-ID mapping and merged vocabulary. */
177 build_widmap(base, lmath, n);
178 return base;
179}
180
183 const char *lmctlfile,
184 logmath_t *lmath)
185{
186 FILE *ctlfp;
187 glist_t lms = NULL;
188 glist_t lmnames = NULL;
189 __BIGSTACKVARIABLE__ char str[1024];
190 ngram_model_t *set = NULL;
191 hash_table_t *classes;
192 char *basedir, *c;
193
194 /* Read all the class definition files to accumulate a mapping of
195 * classnames to definitions. */
196 classes = hash_table_new(0, FALSE);
197 if ((ctlfp = fopen(lmctlfile, "r")) == NULL) {
198 E_ERROR_SYSTEM("Failed to open %s", lmctlfile);
199 return NULL;
200 }
201
202 /* Try to find the base directory to append to relative paths in
203 * the lmctl file. */
204 if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) {
205 /* Include the trailing slash. */
206 basedir = ckd_calloc(c - lmctlfile + 2, 1);
207 memcpy(basedir, lmctlfile, c - lmctlfile + 1);
208 }
209 else {
210 basedir = NULL;
211 }
212 E_INFO("Reading LM control file '%s'\n", lmctlfile);
213 if (basedir)
214 E_INFO("Will prepend '%s' to unqualified paths\n", basedir);
215
216 if (fscanf(ctlfp, "%1023s", str) == 1) {
217 if (strcmp(str, "{") == 0) {
218 /* Load LMclass files */
219 while ((fscanf(ctlfp, "%1023s", str) == 1)
220 && (strcmp(str, "}") != 0)) {
221 char *deffile;
222 if (basedir && !path_is_absolute(str))
223 deffile = string_join(basedir, str, NULL);
224 else
225 deffile = ckd_salloc(str);
226 E_INFO("Reading classdef from '%s'\n", deffile);
227 if (read_classdef_file(classes, deffile) < 0) {
228 ckd_free(deffile);
229 goto error_out;
230 }
231 ckd_free(deffile);
232 }
233
234 if (strcmp(str, "}") != 0) {
235 E_ERROR("Unexpected EOF in %s\n", lmctlfile);
236 goto error_out;
237 }
238
239 /* This might be the first LM name. */
240 if (fscanf(ctlfp, "%1023s", str) != 1)
241 str[0] = '\0';
242 }
243 }
244 else
245 str[0] = '\0';
246
247 /* Read in one LM at a time and add classes to them as necessary. */
248 while (str[0] != '\0') {
249 char *lmfile;
250 ngram_model_t *lm;
251
252 if (basedir && str[0] != '/' && str[0] != '\\')
253 lmfile = string_join(basedir, str, NULL);
254 else
255 lmfile = ckd_salloc(str);
256 E_INFO("Reading lm from '%s'\n", lmfile);
257 lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath);
258 if (lm == NULL) {
259 ckd_free(lmfile);
260 goto error_out;
261 }
262 if (fscanf(ctlfp, "%1023s", str) != 1) {
263 E_ERROR("LMname missing after LMFileName '%s'\n", lmfile);
264 ckd_free(lmfile);
265 goto error_out;
266 }
267 ckd_free(lmfile);
268 lms = glist_add_ptr(lms, lm);
269 lmnames = glist_add_ptr(lmnames, ckd_salloc(str));
270
271 if (fscanf(ctlfp, "%1023s", str) == 1) {
272 if (strcmp(str, "{") == 0) {
273 /* LM uses classes; read their names */
274 while ((fscanf(ctlfp, "%1023s", str) == 1) &&
275 (strcmp(str, "}") != 0)) {
276 void *val;
277 classdef_t *classdef;
278
279 if (hash_table_lookup(classes, str, &val) == -1) {
280 E_ERROR("Unknown class %s in control file\n", str);
281 goto error_out;
282 }
283 classdef = val;
284 if (ngram_model_add_class(lm, str, 1.0,
285 classdef->words, classdef->weights,
286 classdef->n_words) < 0) {
287 goto error_out;
288 }
289 E_INFO("Added class %s containing %d words\n",
290 str, classdef->n_words);
291 }
292 if (strcmp(str, "}") != 0) {
293 E_ERROR("Unexpected EOF in %s\n", lmctlfile);
294 goto error_out;
295 }
296 if (fscanf(ctlfp, "%1023s", str) != 1)
297 str[0] = '\0';
298 }
299 }
300 else
301 str[0] = '\0';
302 }
303 fclose(ctlfp);
304
305 /* Now construct arrays out of lms and lmnames, and build an
306 * ngram_model_set. */
307 lms = glist_reverse(lms);
308 lmnames = glist_reverse(lmnames);
309 {
310 int32 n_models;
311 ngram_model_t **lm_array;
312 char **name_array;
313 gnode_t *lm_node, *name_node;
314 int32 i;
315
316 n_models = glist_count(lms);
317 lm_array = ckd_calloc(n_models, sizeof(*lm_array));
318 name_array = ckd_calloc(n_models, sizeof(*name_array));
319 lm_node = lms;
320 name_node = lmnames;
321 for (i = 0; i < n_models; ++i) {
322 lm_array[i] = gnode_ptr(lm_node);
323 name_array[i] = gnode_ptr(name_node);
324 lm_node = gnode_next(lm_node);
325 name_node = gnode_next(name_node);
326 }
327 set = ngram_model_set_init(config, lm_array, name_array,
328 NULL, n_models);
329 ckd_free(lm_array);
330 ckd_free(name_array);
331 }
332error_out:
333 {
334 gnode_t *gn;
335 glist_t hlist;
336
337 if (set == NULL) {
338 for (gn = lms; gn; gn = gnode_next(gn)) {
340 }
341 }
342 glist_free(lms);
343 for (gn = lmnames; gn; gn = gnode_next(gn)) {
344 ckd_free(gnode_ptr(gn));
345 }
346 glist_free(lmnames);
347 hlist = hash_table_tolist(classes, NULL);
348 for (gn = hlist; gn; gn = gnode_next(gn)) {
349 hash_entry_t *he = gnode_ptr(gn);
350 ckd_free((char *)he->key);
351 classdef_free(he->val);
352 }
353 glist_free(hlist);
354 hash_table_free(classes);
355 ckd_free(basedir);
356 }
357 return set;
358}
359
360int32
362{
364 return set->n_models;
365}
366
369{
372
373 if (set == NULL || set->n_models == 0)
374 return NULL;
375 itor = ckd_calloc(1, sizeof(*itor));
376 itor->set = set;
377 return itor;
378}
379
382{
383 if (++itor->cur == itor->set->n_models) {
385 return NULL;
386 }
387 return itor;
388}
389
390void
395
398 char const **lmname)
399{
400 if (lmname) *lmname = itor->set->names[itor->cur];
401 return itor->set->lms[itor->cur];
402}
403
406 const char *name)
407{
409 int32 i;
410
411 if (name == NULL) {
412 if (set->cur == -1)
413 return NULL;
414 else
415 return set->lms[set->cur];
416 }
417
418 /* There probably won't be very many submodels. */
419 for (i = 0; i < set->n_models; ++i)
420 if (0 == strcmp(set->names[i], name))
421 break;
422 if (i == set->n_models)
423 return NULL;
424 return set->lms[i];
425}
426
429 const char *name)
430{
432 int32 i;
433
434 /* There probably won't be very many submodels. */
435 for (i = 0; i < set->n_models; ++i)
436 if (0 == strcmp(set->names[i], name))
437 break;
438 if (i == set->n_models)
439 return NULL;
440 set->cur = i;
441 return set->lms[set->cur];
442}
443
444const char *
446{
448
449 if (set->cur == -1)
450 return NULL;
451 else
452 return set->names[set->cur];
453}
454
455int32
457 int32 set_wid)
458{
460
461 if (set->cur == -1 || set_wid >= base->n_words)
462 return NGRAM_INVALID_WID;
463 else
464 return set->widmap[set_wid][set->cur];
465}
466
467int32
469 int32 set_wid)
470{
472
473 if (set_wid >= base->n_words)
474 return FALSE;
475 else if (set->cur == -1) {
476 int32 i;
477 for (i = 0; i < set->n_models; ++i) {
478 if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i]))
479 return TRUE;
480 }
481 return FALSE;
482 }
483 else
484 return (set->widmap[set_wid][set->cur]
485 != ngram_unknown_wid(set->lms[set->cur]));
486}
487
490 const char **names,
491 const float32 *weights)
492{
494
495 /* If we have a set of weights here, then set them. */
496 if (names && weights) {
497 int32 i, j;
498
499 /* We hope there aren't many models. */
500 for (i = 0; i < set->n_models; ++i) {
501 for (j = 0; j < set->n_models; ++j)
502 if (0 == strcmp(names[i], set->names[j]))
503 break;
504 if (j == set->n_models) {
505 E_ERROR("Unknown LM name %s\n", names[i]);
506 return NULL;
507 }
508 set->lweights[j] = logmath_log(base->lmath, weights[i]);
509 }
510 }
511 else if (weights) {
512 memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights));
513 }
514 /* Otherwise just enable existing weights. */
515 set->cur = -1;
516 return base;
517}
518
521 ngram_model_t *model,
522 const char *name,
523 float32 weight,
524 int reuse_widmap)
525
526{
528 float32 fprob;
529 int32 scale, i;
530
531 /* Add it to the array of lms. */
532 ++set->n_models;
533 set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms));
534 set->lms[set->n_models - 1] = model;
535 set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names));
536 set->names[set->n_models - 1] = ckd_salloc(name);
537 /* Expand the history mapping table if necessary. */
538 if (model->n > base->n) {
539 base->n = model->n;
540 set->maphist = ckd_realloc(set->maphist,
541 (model->n - 1) * sizeof(*set->maphist));
542 }
543
544 /* Renormalize the interpolation weights. */
545 fprob = weight * 1.0 / set->n_models;
546 set->lweights = ckd_realloc(set->lweights,
547 set->n_models * sizeof(*set->lweights));
548 set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob);
549 /* Now normalize everything else to fit it in. This is
550 * accomplished by simply scaling all the other probabilities
551 * by (1-fprob). */
552 scale = logmath_log(base->lmath, 1.0 - fprob);
553 for (i = 0; i < set->n_models - 1; ++i)
554 set->lweights[i] += scale;
555
556 /* Reuse the old word ID mapping if requested. */
557 if (reuse_widmap) {
558 int32 **new_widmap;
559
560 /* Tack another column onto the widmap array. */
561 new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models,
562 sizeof (**new_widmap));
563 for (i = 0; i < base->n_words; ++i) {
564 /* Copy all the existing mappings. */
565 memcpy(new_widmap[i], set->widmap[i],
566 (set->n_models - 1) * sizeof(**new_widmap));
567 /* Create the new mapping. */
568 new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]);
569 }
570 ckd_free_2d((void **)set->widmap);
571 set->widmap = new_widmap;
572 }
573 else {
574 build_widmap(base, base->lmath, base->n);
575 }
576 return model;
577}
578
581 const char *name,
582 int reuse_widmap)
583{
585 ngram_model_t *submodel;
586 int32 lmidx, scale, n, i;
587 float32 fprob;
588
589 for (lmidx = 0; lmidx < set->n_models; ++lmidx)
590 if (0 == strcmp(name, set->names[lmidx]))
591 break;
592 if (lmidx == set->n_models)
593 return NULL;
594 submodel = set->lms[lmidx];
595
596 /* Renormalize the interpolation weights by scaling them by
597 * 1/(1-fprob) */
598 fprob = logmath_exp(base->lmath, set->lweights[lmidx]);
599 scale = logmath_log(base->lmath, 1.0 - fprob);
600
601 /* Remove it from the array of lms, renormalize remaining weights,
602 * and recalcluate n. */
603 --set->n_models;
604 n = 0;
605 ckd_free(set->names[lmidx]);
606 set->names[lmidx] = NULL;
607 for (i = 0; i < set->n_models; ++i) {
608 if (i >= lmidx) {
609 set->lms[i] = set->lms[i+1];
610 set->names[i] = set->names[i+1];
611 set->lweights[i] = set->lweights[i+1];
612 }
613 set->lweights[i] -= scale;
614 if (set->lms[i]->n > n)
615 n = set->lms[i]->n;
616 }
617 /* There's no need to shrink these arrays. */
618 set->lms[set->n_models] = NULL;
619 set->lweights[set->n_models] = base->log_zero;
620 /* No need to shrink maphist either. */
621
622 /* Reuse the existing word ID mapping if requested. */
623 if (reuse_widmap) {
624 /* Just go through and shrink each row. */
625 for (i = 0; i < base->n_words; ++i) {
626 memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1,
627 (set->n_models - lmidx) * sizeof(**set->widmap));
628 }
629 }
630 else {
631 build_widmap(base, base->lmath, n);
632 }
633 return submodel;
634}
635
636void
638 const char **words,
639 int32 n_words)
640{
642 int32 i;
643
644 /* Recreate the word mapping. */
645 if (base->writable) {
646 for (i = 0; i < base->n_words; ++i) {
647 ckd_free(base->word_str[i]);
648 }
649 }
650 ckd_free(base->word_str);
651 ckd_free_2d((void **)set->widmap);
652 base->writable = TRUE;
653 base->n_words = base->n_1g_alloc = n_words;
654 base->word_str = ckd_calloc(n_words, sizeof(*base->word_str));
655 set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap));
656 hash_table_empty(base->wid);
657 for (i = 0; i < n_words; ++i) {
658 int32 j;
659 base->word_str[i] = ckd_salloc(words[i]);
660 (void)hash_table_enter_int32(base->wid, base->word_str[i], i);
661 for (j = 0; j < set->n_models; ++j) {
662 set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]);
663 }
664 }
665}
666
667static int
668ngram_model_set_apply_weights(ngram_model_t *base, float32 lw,
669 float32 wip, float32 uw)
670{
672 int32 i;
673
674 /* Apply weights to each sub-model. */
675 for (i = 0; i < set->n_models; ++i)
676 ngram_model_apply_weights(set->lms[i], lw, wip, uw);
677 return 0;
678}
679
680static int32
681ngram_model_set_score(ngram_model_t *base, int32 wid,
682 int32 *history, int32 n_hist,
683 int32 *n_used)
684{
686 int32 mapwid;
687 int32 score;
688 int32 i;
689
690 /* Truncate the history. */
691 if (n_hist > base->n - 1)
692 n_hist = base->n - 1;
693
694 /* Interpolate if there is no current. */
695 if (set->cur == -1) {
696 score = base->log_zero;
697 for (i = 0; i < set->n_models; ++i) {
698 int32 j;
699 /* Map word and history IDs for each model. */
700 mapwid = set->widmap[wid][i];
701 for (j = 0; j < n_hist; ++j) {
702 if (history[j] == NGRAM_INVALID_WID)
703 set->maphist[j] = NGRAM_INVALID_WID;
704 else
705 set->maphist[j] = set->widmap[history[j]][i];
706 }
707 score = logmath_add(base->lmath, score,
708 set->lweights[i] +
709 ngram_ng_score(set->lms[i],
710 mapwid, set->maphist, n_hist, n_used));
711 }
712 }
713 else {
714 int32 j;
715 /* Map word and history IDs (FIXME: do this in a function?) */
716 mapwid = set->widmap[wid][set->cur];
717 for (j = 0; j < n_hist; ++j) {
718 if (history[j] == NGRAM_INVALID_WID)
719 set->maphist[j] = NGRAM_INVALID_WID;
720 else
721 set->maphist[j] = set->widmap[history[j]][set->cur];
722 }
723 score = ngram_ng_score(set->lms[set->cur],
724 mapwid, set->maphist, n_hist, n_used);
725 }
726
727 return score;
728}
729
730static int32
731ngram_model_set_raw_score(ngram_model_t *base, int32 wid,
732 int32 *history, int32 n_hist,
733 int32 *n_used)
734{
736 int32 mapwid;
737 int32 score;
738 int32 i;
739
740 /* Truncate the history. */
741 if (n_hist > base->n - 1)
742 n_hist = base->n - 1;
743
744 /* Interpolate if there is no current. */
745 if (set->cur == -1) {
746 score = base->log_zero;
747 for (i = 0; i < set->n_models; ++i) {
748 int32 j;
749 /* Map word and history IDs for each model. */
750 mapwid = set->widmap[wid][i];
751 for (j = 0; j < n_hist; ++j) {
752 if (history[j] == NGRAM_INVALID_WID)
753 set->maphist[j] = NGRAM_INVALID_WID;
754 else
755 set->maphist[j] = set->widmap[history[j]][i];
756 }
757 score = logmath_add(base->lmath, score,
758 set->lweights[i] +
759 ngram_ng_prob(set->lms[i],
760 mapwid, set->maphist, n_hist, n_used));
761 }
762 }
763 else {
764 int32 j;
765 /* Map word and history IDs (FIXME: do this in a function?) */
766 mapwid = set->widmap[wid][set->cur];
767 for (j = 0; j < n_hist; ++j) {
768 if (history[j] == NGRAM_INVALID_WID)
769 set->maphist[j] = NGRAM_INVALID_WID;
770 else
771 set->maphist[j] = set->widmap[history[j]][set->cur];
772 }
773 score = ngram_ng_prob(set->lms[set->cur],
774 mapwid, set->maphist, n_hist, n_used);
775 }
776
777 return score;
778}
779
780static int32
781ngram_model_set_add_ug(ngram_model_t *base,
782 int32 wid, int32 lweight)
783{
785 int32 *newwid;
786 int32 i, prob;
787
788 /* At this point the word has already been added to the master
789 model and we have a new word ID for it. Add it to active
790 submodels and track the word IDs. */
791 newwid = ckd_calloc(set->n_models, sizeof(*newwid));
792 prob = base->log_zero;
793 for (i = 0; i < set->n_models; ++i) {
794 int32 wprob, n_hist;
795
796 /* Only add to active models. */
797 if (set->cur == -1 || set->cur == i) {
798 /* Did this word already exist? */
799 newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]);
800 if (newwid[i] == NGRAM_INVALID_WID) {
801 /* Add it to the submodel. */
802 newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid],
803 logmath_exp(base->lmath, lweight));
804 if (newwid[i] == NGRAM_INVALID_WID) {
805 ckd_free(newwid);
806 return base->log_zero;
807 }
808 }
809 /* Now get the unigram probability for the new word and either
810 * interpolate it or use it (if this is the current model). */
811 wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist);
812 if (set->cur == i)
813 prob = wprob;
814 else if (set->cur == -1)
815 prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob);
816 }
817 else {
818 newwid[i] = NGRAM_INVALID_WID;
819 }
820 }
821 /* Okay we have the word IDs for this in all the submodels. Now
822 do some complicated memory mangling to add this to the
823 widmap. */
824 set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap));
825 set->widmap[0] = ckd_realloc(set->widmap[0],
826 base->n_words
827 * set->n_models
828 * sizeof(**set->widmap));
829 for (i = 0; i < base->n_words; ++i)
830 set->widmap[i] = set->widmap[0] + i * set->n_models;
831 memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid));
832 ckd_free(newwid);
833 return prob;
834}
835
836static void
837ngram_model_set_free(ngram_model_t *base)
838{
840 int32 i;
841
842 for (i = 0; i < set->n_models; ++i)
843 ngram_model_free(set->lms[i]);
844 ckd_free(set->lms);
845 for (i = 0; i < set->n_models; ++i)
846 ckd_free(set->names[i]);
847 ckd_free(set->names);
848 ckd_free(set->lweights);
849 ckd_free(set->maphist);
850 ckd_free_2d((void **)set->widmap);
851}
852
853static void
854ngram_model_set_flush(ngram_model_t *base)
855{
857 int32 i;
858
859 for (i = 0; i < set->n_models; ++i)
860 ngram_model_flush(set->lms[i]);
861}
862
863static ngram_funcs_t ngram_model_set_funcs = {
864 ngram_model_set_free, /* free */
865 ngram_model_set_apply_weights, /* apply_weights */
866 ngram_model_set_score, /* score */
867 ngram_model_set_raw_score, /* raw_score */
868 ngram_model_set_add_ug, /* add_ug */
869 ngram_model_set_flush /* flush */
870};
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition ckd_alloc.c:252
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
Definition ckd_alloc.h:258
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
File names related operation.
SPHINXBASE_EXPORT int path_is_absolute(const char *file)
Test whether a pathname is absolute for the current OS.
Definition filename.c:126
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
Definition glist.c:169
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed.
Definition glist.c:133
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generi...
Definition glist.c:74
#define gnode_ptr(g)
Head of a list of gnodes.
Definition glist.h:109
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of element in a given link list.
Definition glist.c:145
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition hash_table.c:695
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
Definition hash_table.h:228
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
Definition hash_table.c:623
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition hash_table.c:309
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
Definition hash_table.c:490
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition hash_table.c:158
SPHINXBASE_EXPORT int logmath_add(logmath_t *lmath, int logb_p, int logb_q)
Add two values in log space (i.e.
Definition logmath.c:392
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
Definition logmath.c:368
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
Definition logmath.c:456
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
Definition logmath.c:447
SPHINXBASE_EXPORT int logmath_get_shift(logmath_t *lmath)
Get the shift of the values in a log table.
Definition logmath.c:386
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
#define NGRAM_INVALID_WID
Impossible word ID.
Definition ngram_model.h:84
@ NGRAM_AUTO
Determine file type automatically.
Definition ngram_model.h:78
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
const char * ngram_model_set_current(ngram_model_t *base)
Get the current language model name, if any.
ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *base)
Begin iterating over language models in a set.
ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a langauge model set.
ngram_model_t * ngram_model_set_add(ngram_model_t *base, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
ngram_model_t * ngram_model_set_select(ngram_model_t *base, const char *name)
Select a single language model from a set for scoring.
ngram_model_t * ngram_model_set_remove(ngram_model_t *base, const char *name, int reuse_widmap)
Remove a language model from a set.
ngram_model_t * ngram_model_set_interp(ngram_model_t *base, const char **names, const float32 *weights)
Set interpolation weights for a set and enables interpolation.
int32 ngram_model_set_current_wid(ngram_model_t *base, int32 set_wid)
Query the word-ID mapping for the current language model.
ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
ngram_model_t * ngram_model_set_lookup(ngram_model_t *base, const char *name)
Look up a language model by name from a set.
int32 ngram_model_set_known_wid(ngram_model_t *base, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set.
void ngram_model_set_map_words(ngram_model_t *base, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
int32 ngram_model_set_count(ngram_model_t *base)
Returns the number of language models in a set.
Set of language models.
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition strfuncs.c:62
One class definition from a classdef file.
Opaque structure used to hold the results of command-line parsing.
A node in a generic list.
Definition glist.h:100
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
Definition hash_table.h:149
void * val
Key-length; the key string does not have to be a C-style NULL terminated string; it can have arbitrar...
Definition hash_table.h:155
Implementation-specific functions for operating on ngram_model_t objects.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
int32 log_zero
Zero probability, cached here for quick lookup.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
hash_table_t * wid
Mapping of unigram names to word IDs.
uint8 writable
Are word strings writable?
char ** word_str
Unigram names.
Iterator over a model set.
Subclass of ngram_model for grouping language models.
int32 cur
Currently selected model, or -1 for none.
int32 * lweights
Log interpolation weights.
ngram_model_t base
Base ngram_model_t structure.
int32 n_models
Number of models in this set.
ngram_model_t ** lms
Language models in this set.
int32 ** widmap
Word ID mapping for submodels.
int32 * maphist
Word ID mapping for N-Gram history.
char ** names
Names for language models.