45#define BINARY_SEARCH_THRESH 16
47find_bg(
bigram_t * bg, int32 n, int32 w)
54 while (e - b > BINARY_SEARCH_THRESH) {
58 else if (bg[i].wid > w)
65 for (i = b; (i < e) && (bg[i].wid != w); i++);
66 return ((i < e) ? i : -1);
70lm3g_bg_score(NGRAM_MODEL_TYPE *model,
71 int32 lw1, int32 lw2, int32 *n_used)
76 if (lw1 < 0 || model->base.n < 2) {
78 return model->lm3g.unigrams[lw2].prob1.l;
81 b = FIRST_BG(model, lw1);
82 n = FIRST_BG(model, lw1 + 1) - b;
83 bg = model->lm3g.bigrams + b;
85 if ((i = find_bg(bg, n, lw2)) >= 0) {
93 score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l;
100load_tginfo(NGRAM_MODEL_TYPE *model, int32 lw1, int32 lw2)
110 tginfo->
next = model->lm3g.tginfo[lw2];
111 model->lm3g.tginfo[lw2] = tginfo;
114 b = model->lm3g.unigrams[lw1].bigrams;
115 n = model->lm3g.unigrams[lw1 + 1].bigrams - b;
116 bg = model->lm3g.bigrams + b;
118 if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) {
119 tginfo->
bowt = model->lm3g.bo_wt2[bg[i].
bo_wt2].l;
123 t = FIRST_TG(model, b);
125 tginfo->
tg = model->lm3g.trigrams + t;
128 tginfo->
n_tg = FIRST_TG(model, b + 1) - t;
144 while (e - b > BINARY_SEARCH_THRESH) {
148 else if (tg[i].wid > w)
154 for (i = b; (i < e) && (tg[i].wid != w); i++);
155 return ((i < e) ? i : -1);
159lm3g_tg_score(NGRAM_MODEL_TYPE *model, int32 lw1,
160 int32 lw2, int32 lw3, int32 *n_used)
167 if ((base->
n < 3) || (lw1 < 0) || (lw2 < 0))
168 return (lm3g_bg_score(model, lw2, lw3, n_used));
171 for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->
next) {
172 if (tginfo->
w1 == lw1)
174 prev_tginfo = tginfo;
178 load_tginfo(model, lw1, lw2);
179 tginfo = model->lm3g.tginfo[lw2];
181 else if (prev_tginfo) {
183 tginfo->
next = model->lm3g.tginfo[lw2];
184 model->lm3g.tginfo[lw2] = tginfo;
192 if ((i = find_tg(tg, n, lw3)) >= 0) {
198 score = tginfo->
bowt + lm3g_bg_score(model, lw2, lw3, n_used);
206 int32 *history, int32 n_hist,
209 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
214 return model->lm3g.unigrams[wid].prob1.l;
216 return lm3g_bg_score(model, history[0], wid, n_used);
220 return lm3g_tg_score(model, history[1], history[0], wid, n_used);
226 int32 *history, int32 n_hist,
229 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
237 score = model->lm3g.unigrams[wid].prob1.l - base->
log_wip;
239 score = (int32)(score / base->
lw);
241 if (strcmp(base->
word_str[wid],
"<s>") != 0) {
249 score = lm3g_bg_score(model, history[0], wid, n_used);
254 score = lm3g_tg_score(model, history[1], history[0], wid, n_used);
258 return (int32)((score - base->
log_wip) / base->
lw);
263 int32 wid, int32 lweight)
265 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
266 return lm3g_add_ug(base, &model->lm3g, wid, lweight);
272 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
273 lm3g_tginfo_reset(base, &model->lm3g);
285 int32 *history, int32 n_hist)
287 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
290 ngram_iter_init((
ngram_iter_t *)itor, base, n_hist, FALSE);
294 itor->ug = model->lm3g.unigrams + wid;
297 else if (n_hist == 1) {
300 itor->ug = model->lm3g.unigrams + history[0];
301 b = FIRST_BG(model, history[0]);
302 n = FIRST_BG(model, history[0] + 1) - b;
303 itor->bg = model->lm3g.bigrams + b;
305 if ((i = find_bg(itor->bg, n, wid)) < 0) {
312 else if (n_hist == 2) {
316 itor->ug = model->lm3g.unigrams + history[1];
318 for (tginfo = model->lm3g.tginfo[history[0]];
319 tginfo; tginfo = tginfo->
next) {
320 if (tginfo->
w1 == history[1])
322 prev_tginfo = tginfo;
326 load_tginfo(model, history[1], history[0]);
327 tginfo = model->lm3g.tginfo[history[0]];
329 else if (prev_tginfo) {
331 tginfo->
next = model->lm3g.tginfo[history[0]];
332 model->lm3g.tginfo[history[0]] = tginfo;
339 itor->tg = tginfo->
tg;
340 if ((i = find_tg(itor->tg, n, wid)) >= 0) {
345 itor->bg = model->lm3g.bigrams;
346 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))
347 <= (itor->tg - model->lm3g.trigrams))
367 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base;
371 itor->ug = model->lm3g.unigrams;
372 itor->bg = model->lm3g.bigrams;
373 itor->tg = model->lm3g.trigrams;
376 if (m > 1 && base->
n_counts[1] > 1) {
377 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))
378 <= (itor->tg - model->lm3g.trigrams))
383 if (m > 0 && base->
n_counts[0] > 1) {
384 while (itor->ug[1].
bigrams <= (itor->bg - model->lm3g.bigrams))
394 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)bitor->model;
403 if (((itor->ug + 1) - model->lm3g.unigrams < bitor->model->n_counts[0] &&
404 itor->ug->
bigrams == (itor->ug + 1)->bigrams) ||
405 itor->ug->
bigrams == bitor->model->n_counts[1])
409 itor->bg = model->lm3g.bigrams + itor->ug->
bigrams;
415 if (((itor->bg + 1) - model->lm3g.bigrams < bitor->model->n_counts[1] &&
416 FIRST_TG (model, itor->bg - model->lm3g.bigrams) ==
417 FIRST_TG (model, (itor->bg + 1) - model->lm3g.bigrams)) ||
418 FIRST_TG (model, itor->bg - model->lm3g.bigrams) == bitor->model->n_counts[2])
422 itor->tg = (model->lm3g.trigrams
423 + FIRST_TG(model, (itor->bg - model->lm3g.bigrams)));
425 printf(
"%s %s => %d (%s)\n",
426 model->base.word_str[itor->ug - model->lm3g.unigrams],
427 model->base.word_str[itor->bg->
wid],
428 FIRST_TG(model, (itor->bg - model->lm3g.bigrams)),
429 model->base.word_str[itor->tg->
wid]);
438 ngram_iter_init((
ngram_iter_t *)itor, bitor->model, bitor->m + 1, TRUE);
447 int32 *out_score, int32 *out_bowt)
449 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model;
452 base->wids[0] = itor->ug - model->lm3g.unigrams;
453 if (itor->bg) base->wids[1] = itor->bg->
wid;
454 if (itor->tg) base->wids[2] = itor->tg->
wid;
456 printf(
"itor_get: %d %d %d\n", base->wids[0], base->wids[1], base->wids[2]);
461 *out_score = itor->ug->
prob1.l;
462 *out_bowt = itor->ug->
bo_wt1.l;
465 *out_score = model->lm3g.prob2[itor->bg->
prob2].l;
466 if (model->lm3g.bo_wt2)
467 *out_bowt = model->lm3g.bo_wt2[itor->bg->
bo_wt2].l;
472 *out_score = model->lm3g.prob3[itor->tg->
prob3].l;
484 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model;
491 if (itor->ug - model->lm3g.unigrams >= base->model->
n_counts[0])
497 if (itor->bg - model->lm3g.bigrams >= base->model->
n_counts[1])
501 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].
bigrams) {
507 if (itor->ug == model->lm3g.unigrams + base->model->
n_counts[0]) {
508 E_ERROR(
"Bigram %d has no valid unigram parent\n",
509 itor->bg - model->lm3g.bigrams);
517 if (itor->tg - model->lm3g.trigrams >= base->model->
n_counts[2])
520 while (itor->tg - model->lm3g.trigrams >=
521 FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))) {
525 if (itor->bg == model->lm3g.bigrams + base->model->
n_counts[1]) {
526 E_ERROR(
"Trigram %d has no valid bigram parent\n",
527 itor->tg - model->lm3g.trigrams);
533 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].
bigrams) {
535 if (itor->ug == model->lm3g.unigrams + base->model->
n_counts[0]) {
536 E_ERROR(
"Trigram %d has no valid unigram parent\n",
537 itor->tg - model->lm3g.trigrams);
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define E_ERROR
Print error message to standard error stream.
#define listelem_malloc(le)
Allocate a list element and return pointer to it.
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
uint16 bo_wt2
Index into array of actual bigram backoff wts.
uint16 prob2
Index into array of actual bigram probs.
uint32 wid
Index of the unigram entry for this bigram's second word.
Base iterator structure for N-grams.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 log_wip
Log of word insertion penalty.
int32 log_uniform
Log of uniform (0-gram) probability.
float32 lw
Language model scaling factor.
int32 * n_counts
Counts for 1, 2, 3, ... grams.
int32 log_uniform_weight
Log of uniform weight (i.e. 1 - unigram weight).
char ** word_str
Unigram names.
Trigram information cache.
struct tginfo_s * next
Next lw1 with same parent lw2; NULL if none.
trigram_t * tg
Trigrams for lw1,lw2.
int32 used
Whether this entry has been used since the last lm_reset.
int32 n_tg
Number of trigrams for the parent bigram lw1,lw2.
int32 w1
lw1 component of bigram lw1,lw2.
int32 bowt
Trigram backoff weight for the bigram lw1,lw2.
uint32 wid
Index of the unigram entry for this trigram's third word.
uint16 prob3
Index into array of actual trigram probs.
Unigram structure (common among all lm3g implementations)
lmprob_t prob1
Unigram probability.
lmprob_t bo_wt1
Unigram backoff weight.
int32 bigrams
Index of 1st entry in lm_t.bigrams[].