SphinxBase 0.6
lm3g_model.c
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file lm3g_model.c Core Sphinx 3-gram code used in
 * DMP/DMP32/ARPA (for now) model code.
 *
 * Author: A cast of thousands, probably.
 */
#include <string.h>
#include <assert.h>
#include <limits.h>

#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/listelem_alloc.h"
#include "sphinxbase/err.h"

#include "lm3g_model.h"

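/*
 * Free the cached trigram information: the list element allocator that
 * owns the tginfo_t entries, and the per-unigram array of list heads.
 */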
void
lm3g_tginfo_free(ngram_model_t *base, lm3g_model_t *lm3g)
{
    if (lm3g->tginfo == NULL)
        return;
    listelem_alloc_free(lm3g->le);
    ckd_free(lm3g->tginfo);
}

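/*
 * Empty the trigram information cache without freeing the array itself:
 * the per-unigram list heads are cleared and a fresh list element
 * allocator is created for future entries.
 */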
void
lm3g_tginfo_reset(ngram_model_t *base, lm3g_model_t *lm3g)
{
    if (lm3g->tginfo == NULL)
        return;
    listelem_alloc_free(lm3g->le);
    memset(lm3g->tginfo, 0, base->n_counts[0] * sizeof(tginfo_t *));
    lm3g->le = listelem_alloc_init(sizeof(tginfo_t));
}

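/*
 * Apply a new language weight (lw), word insertion penalty (wip), and
 * unigram weight (uw) to the model.  The previously applied values are
 * still stored in the base model (base->lw, base->log_wip), so each
 * stored log value is first unscaled with the old parameters and then
 * rescaled with the new ones.
 */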
void
lm3g_apply_weights(ngram_model_t *base,
                   lm3g_model_t *lm3g,
                   float32 lw, float32 wip, float32 uw)
{
    int32 log_wip, log_uw, log_uniform_weight;
    int i;

    /* Precalculate some log values we will need. */
    log_wip = logmath_log(base->lmath, wip);
    log_uw = logmath_log(base->lmath, uw);
    log_uniform_weight = logmath_log(base->lmath, 1.0 - uw);

    for (i = 0; i < base->n_counts[0]; ++i) {
        int32 prob1, bo_wt, n_used;

        /* Backoff weights just get scaled by the lw. */
        bo_wt = (int32)(lm3g->unigrams[i].bo_wt1.l / base->lw);
        /* Unscaling unigram probs is a bit more complicated, so punt
         * it back to the general code. */
        prob1 = ngram_ng_prob(base, i, NULL, 0, &n_used);
        /* Now compute the new scaled probabilities. */
        lm3g->unigrams[i].bo_wt1.l = (int32)(bo_wt * lw);
        if (strcmp(base->word_str[i], "<s>") == 0) { /* FIXME: configurable start_sym */
            /* Apply language weight and WIP */
            lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
        }
        else {
            /* Interpolate unigram probability with uniform. */
            prob1 += log_uw;
            prob1 = logmath_add(base->lmath, prob1, base->log_uniform + log_uniform_weight);
            /* Apply language weight and WIP */
            lm3g->unigrams[i].prob1.l = (int32)(prob1 * lw) + log_wip;
        }
    }

    for (i = 0; i < lm3g->n_prob2; ++i) {
        int32 prob2;
        /* Can't just punt this back to general code since it is quantized. */
        prob2 = (int32)((lm3g->prob2[i].l - base->log_wip) / base->lw);
        lm3g->prob2[i].l = (int32)(prob2 * lw) + log_wip;
    }

    if (base->n > 2) {
        for (i = 0; i < lm3g->n_bo_wt2; ++i) {
            lm3g->bo_wt2[i].l = (int32)(lm3g->bo_wt2[i].l / base->lw * lw);
        }
        for (i = 0; i < lm3g->n_prob3; i++) {
            int32 prob3;
            /* Can't just punt this back to general code since it is quantized. */
            prob3 = (int32)((lm3g->prob3[i].l - base->log_wip) / base->lw);
            lm3g->prob3[i].l = (int32)(prob3 * lw) + log_wip;
        }
    }

    /* Store updated values in the model. */
    base->log_wip = log_wip;
    base->log_uw = log_uw;
    base->log_uniform_weight = log_uniform_weight;
    base->lw = lw;
}

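/*
 * Add a new unigram with (log) weight lweight given relative to the
 * uniform distribution.  The unigram and tginfo arrays are grown to
 * base->n_1g_alloc entries and zero-filled above the current unigram
 * count, and the new unigram's probability is interpolated with the
 * uniform distribution using the unigram weight.
 */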
int32
lm3g_add_ug(ngram_model_t *base,
            lm3g_model_t *lm3g, int32 wid, int32 lweight)
{
    int32 score;

    /* This would be very bad if this happened! */
    assert(!NGRAM_IS_CLASSWID(wid));

    /* Reallocate unigram array. */
    lm3g->unigrams = ckd_realloc(lm3g->unigrams,
                                 sizeof(*lm3g->unigrams) * base->n_1g_alloc);
    memset(lm3g->unigrams + base->n_counts[0], 0,
           (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->unigrams));
    /* Reallocate tginfo array. */
    lm3g->tginfo = ckd_realloc(lm3g->tginfo,
                               sizeof(*lm3g->tginfo) * base->n_1g_alloc);
    memset(lm3g->tginfo + base->n_counts[0], 0,
           (base->n_1g_alloc - base->n_counts[0]) * sizeof(*lm3g->tginfo));
    /* FIXME: we really ought to update base->log_uniform *and*
     * renormalize all the other unigrams.  This is really slow, so I
     * will probably just provide a function to renormalize after
     * adding unigrams, for anyone who really cares. */
    /* This could be simplified but then we couldn't do it in logmath */
    score = lweight + base->log_uniform + base->log_uw;
    score = logmath_add(base->lmath, score,
                        base->log_uniform + base->log_uniform_weight);
    lm3g->unigrams[wid].prob1.l = score;
    /* This unigram by definition doesn't participate in any bigrams,
     * so its backoff weight and bigram pointer are both undefined. */
    lm3g->unigrams[wid].bo_wt1.l = 0;
    lm3g->unigrams[wid].bigrams = 0;
    /* Finally, increase the unigram count. */
    ++base->n_counts[0];
    /* FIXME: Note that this can actually be quite bogus due to the
     * presence of class words.  If wid falls outside the unigram
     * count, increase it to compensate, at the cost of no longer
     * really knowing how many unigrams we have :( */
    if (wid >= base->n_counts[0])
        base->n_counts[0] = wid + 1;

    return score;
}

#define INITIAL_SORTED_ENTRIES MAX_UINT16

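/*
 * The "sorted list" below is a binary search tree stored in a flat,
 * growable array.  Entry 0 is a sentinel root holding INT_MIN; each
 * node's lower/higher fields are array indices of its children, with 0
 * meaning "no child".  Nodes are never removed, so the index returned
 * by sorted_id() is a stable ID for a value, and vals_in_sorted_list()
 * can later flatten the stored values into a contiguous table ordered
 * by those IDs.
 */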
void
init_sorted_list(sorted_list_t * l)
{
    l->list = ckd_calloc(INITIAL_SORTED_ENTRIES, sizeof(sorted_entry_t));
    l->list[0].val.l = INT_MIN;
    l->list[0].lower = 0;
    l->list[0].higher = 0;
    l->free = 1;
    l->size = INITIAL_SORTED_ENTRIES;
}

void
free_sorted_list(sorted_list_t * l)
{
    free(l->list);
}

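/* Copy the values out of the tree into a newly allocated array indexed
 * by the IDs previously returned by sorted_id(). */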
lmprob_t *
vals_in_sorted_list(sorted_list_t * l)
{
    lmprob_t *vals;
    int32 i;

    vals = ckd_calloc(l->free, sizeof(lmprob_t));
    for (i = 0; i < l->free; i++)
        vals[i] = l->list[i].val;
    return (vals);
}

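/* Return the ID (array index) of *val, inserting a new node (and
 * growing the array by INITIAL_SORTED_ENTRIES entries if necessary)
 * when the value is not already present. */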
int32
sorted_id(sorted_list_t * l, int32 *val)
{
    int32 i = 0;

    for (;;) {
        if (*val == l->list[i].val.l)
            return (i);
        if (*val < l->list[i].val.l) {
            if (l->list[i].lower == 0) {

                if (l->free >= l->size) {
                    int newsize = l->size + INITIAL_SORTED_ENTRIES;
                    l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize);
                    memset(l->list + l->size,
                           0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t));
                    l->size = newsize;
                }

                l->list[i].lower = l->free;
                (l->free)++;
                i = l->list[i].lower;
                l->list[i].val.l = *val;
                return (i);
            }
            else
                i = l->list[i].lower;
        }
        else {
            if (l->list[i].higher == 0) {

                if (l->free >= l->size) {
                    int newsize = l->size + INITIAL_SORTED_ENTRIES;
                    l->list = ckd_realloc(l->list, sizeof(sorted_entry_t) * newsize);
                    memset(l->list + l->size,
                           0, INITIAL_SORTED_ENTRIES * sizeof(sorted_entry_t));
                    l->size = newsize;
                }

                l->list[i].higher = l->free;
                (l->free)++;
                i = l->list[i].higher;
                l->list[i].val.l = *val;
                return (i);
            }
            else
                i = l->list[i].higher;
        }
    }
}