SphinxBase 0.6
ngram_model_arpa.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * \file ngram_model_arpa.c ARPA format language models
39 *
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41 */
42
44#include <string.h>
45#include <limits.h>
46#include <assert.h>
47
48#include "sphinxbase/err.h"
49#include "sphinxbase/pio.h"
51#include "sphinxbase/strfuncs.h"
52
53#include "ngram_model_arpa.h"
54
/* Function table for this model type; defined at the bottom of this file. */
static ngram_funcs_t ngram_model_arpa_funcs;

/* TSEG_BASE(m,b): index of the first trigram belonging to the bigram
 * segment that contains bigram b (segments are 1<<LOG_BG_SEG_SZ bigrams
 * wide; bigram_t.trigrams stores only a 16-bit offset from this base). */
#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* FIRST_BG(m,u): index of the first bigram whose first word is unigram u. */
#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* FIRST_TG(m,b): absolute index of the first trigram following bigram b. */
#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
60
61/*
62 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
63 */
64static int
65ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
66{
67 int32 ngram, ngram_cnt;
68
69 /* skip file until past the '\data\' marker */
70 while (*li) {
71 string_trim((*li)->buf, STRING_BOTH);
72 if (strcmp((*li)->buf, "\\data\\") == 0)
73 break;
74 *li = lineiter_next(*li);
75 }
76 if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
77 E_INFO("No \\data\\ mark in LM file\n");
78 return -1;
79 }
80
81 *n_ug = *n_bg = *n_tg = 0;
82 while ((*li = lineiter_next(*li))) {
83 if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
84 break;
85 switch (ngram) {
86 case 1:
87 *n_ug = ngram_cnt;
88 break;
89 case 2:
90 *n_bg = ngram_cnt;
91 break;
92 case 3:
93 *n_tg = ngram_cnt;
94 break;
95 default:
96 E_ERROR("Unknown ngram (%d)\n", ngram);
97 return -1;
98 }
99 }
100 if (*li == NULL) {
101 E_ERROR("EOF while reading ngram counts\n");
102 return -1;
103 }
104
105 /* Position iterator to the unigrams header '\1-grams:\' */
106 while ((*li = lineiter_next(*li))) {
107 string_trim((*li)->buf, STRING_BOTH);
108 if (strcmp((*li)->buf, "\\1-grams:") == 0)
109 break;
110 }
111 if (*li == NULL) {
112 E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
113 return -1;
114 }
115
116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117 E_ERROR("Bad or missing ngram count\n");
118 return -1;
119 }
120 return 0;
121}
122
123/*
124 * Read in the unigrams from given file into the LM structure model.
125 * On entry to this procedure, the iterator is positioned to the
126 * header line '\1-grams:'.
127 */
128static int
129ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
130{
131 ngram_model_t *base = &model->base;
132 int32 wcnt;
133 float p1;
134
135 E_INFO("Reading unigrams\n");
136
137 wcnt = 0;
138 while ((*li = lineiter_next(*li))) {
139 char *wptr[3], *name;
140 float32 bo_wt = 0.0f;
141 int n;
142
143 string_trim((*li)->buf, STRING_BOTH);
144 if (strcmp((*li)->buf, "\\2-grams:") == 0
145 || strcmp((*li)->buf, "\\end\\") == 0)
146 break;
147
148 if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
149 if ((*li)->buf[0] != '\0')
150 E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
151 continue;
152 }
153 else {
154 p1 = (float)atof_c(wptr[0]);
155 name = wptr[1];
156 if (n == 3)
157 bo_wt = (float)atof_c(wptr[2]);
158 }
159
160 if (wcnt >= base->n_counts[0]) {
161 E_ERROR("Too many unigrams\n");
162 return -1;
163 }
164
165 /* Associate name with word id */
166 base->word_str[wcnt] = ckd_salloc(name);
167 if ((hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt))
168 != (void *)(long)wcnt) {
169 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[wcnt]);
170 }
171 model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
172 model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
173 wcnt++;
174 }
175
176 if (base->n_counts[0] != wcnt) {
177 E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
178 base->n_counts[0], wcnt);
179 base->n_counts[0] = wcnt;
180 base->n_words = wcnt;
181 }
182 return 0;
183}
184
185/*
186 * Read bigrams from given file into given model structure.
187 */
188static int
189ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
190{
191 ngram_model_t *base = &model->base;
192 int32 w1, w2, prev_w1, bgcount;
193 bigram_t *bgptr;
194
195 E_INFO("Reading bigrams\n");
196
197 bgcount = 0;
198 bgptr = model->lm3g.bigrams;
199 prev_w1 = -1;
200
201 while ((*li = lineiter_next(*li))) {
202 float32 p, bo_wt = 0.0f;
203 int32 p2, bo_wt2;
204 char *wptr[4], *word1, *word2;
205 int n;
206
207 string_trim((*li)->buf, STRING_BOTH);
208 wptr[3] = NULL;
209 if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
210 if ((*li)->buf[0] != '\0')
211 break;
212 continue;
213 }
214 else {
215 p = (float32)atof_c(wptr[0]);
216 word1 = wptr[1];
217 word2 = wptr[2];
218 if (wptr[3])
219 bo_wt = (float32)atof_c(wptr[3]);
220 }
221
222 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
223 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
224 word1, word1, word2);
225 continue;
226 }
227 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
228 E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
229 word2, word1, word2);
230 continue;
231 }
232
233 /* FIXME: Should use logmath_t quantization here. */
234 /* HACK!! to quantize probs to 4 decimal digits */
235 p = (float32)((int32)(p * 10000)) / 10000;
236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
237
238 p2 = logmath_log10_to_log(base->lmath, p);
239 bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);
240
241 if (bgcount >= base->n_counts[1]) {
242 E_ERROR("Too many bigrams\n");
243 return -1;
244 }
245
246 bgptr->wid = w2;
247 bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
248 if (base->n_counts[2] > 0)
249 bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
250
251 if (w1 != prev_w1) {
252 if (w1 < prev_w1) {
253 E_ERROR("Bigrams not in unigram order\n");
254 return -1;
255 }
256
257 for (prev_w1++; prev_w1 <= w1; prev_w1++)
258 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
259 prev_w1 = w1;
260 }
261 bgcount++;
262 bgptr++;
263
264 if ((bgcount & 0x0000ffff) == 0) {
265 E_INFOCONT(".");
266 }
267 }
268 if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
269 && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
270 E_ERROR("Bad bigram: %s\n", (*li)->buf);
271 return -1;
272 }
273
274 for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
275 model->lm3g.unigrams[prev_w1].bigrams = bgcount;
276
277 return 0;
278}
279
280/*
281 * Very similar to ReadBigrams.
282 */
283static int
284ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
285{
286 ngram_model_t *base = &model->base;
287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288 int32 seg, prev_seg, prev_seg_lastbg;
289 trigram_t *tgptr;
290 bigram_t *bgptr;
291
292 E_INFO("Reading trigrams\n");
293
294 tgcount = 0;
295 tgptr = model->lm3g.trigrams;
296 prev_w1 = -1;
297 prev_w2 = -1;
298 prev_bg = -1;
299 prev_seg = -1;
300
301 while ((*li = lineiter_next(*li))) {
302 float32 p;
303 int32 p3;
304 char *wptr[4], *word1, *word2, *word3;
305
306 string_trim((*li)->buf, STRING_BOTH);
307 if (str2words((*li)->buf, wptr, 4) != 4) {
308 if ((*li)->buf[0] != '\0')
309 break;
310 continue;
311 }
312 else {
313 p = (float32)atof_c(wptr[0]);
314 word1 = wptr[1];
315 word2 = wptr[2];
316 word3 = wptr[3];
317 }
318
319 if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
320 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
321 word1, word1, word2, word3);
322 continue;
323 }
324 if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
325 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
326 word2, word1, word2, word3);
327 continue;
328 }
329 if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
330 E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
331 word3, word1, word2, word3);
332 continue;
333 }
334
335 /* FIXME: Should use logmath_t quantization here. */
336 /* HACK!! to quantize probs to 4 decimal digits */
337 p = (float32)((int32)(p * 10000)) / 10000;
338 p3 = logmath_log10_to_log(base->lmath, p);
339
340 if (tgcount >= base->n_counts[2]) {
341 E_ERROR("Too many trigrams\n");
342 return -1;
343 }
344
345 tgptr->wid = w3;
346 tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);
347
348 if ((w1 != prev_w1) || (w2 != prev_w2)) {
349 /* Trigram for a new bigram; update tg info for all previous bigrams */
350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351 E_ERROR("Trigrams not in bigram order\n");
352 return -1;
353 }
354
355 bg = (w1 !=
356 prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
357 endbg = model->lm3g.unigrams[w1 + 1].bigrams;
358 bgptr = model->lm3g.bigrams + bg;
359 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
360 if (bg >= endbg) {
361 E_ERROR("Missing bigram for trigram: %s", (*li)->buf);
362 return -1;
363 }
364
365 /* bg = bigram entry index for <w1,w2>. Update tseg_base */
366 seg = bg >> LOG_BG_SEG_SZ;
367 for (i = prev_seg + 1; i <= seg; i++)
368 model->lm3g.tseg_base[i] = tgcount;
369
370 /* Update trigrams pointers for all bigrams until bg */
371 if (prev_seg < seg) {
372 int32 tgoff = 0;
373
374 if (prev_seg >= 0) {
375 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
376 if (tgoff > 65535) {
377 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
378 return -1;
379 }
380 }
381
382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383 bgptr = model->lm3g.bigrams + prev_bg;
384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
385 prev_bg++, bgptr++)
386 bgptr->trigrams = tgoff;
387
388 for (; prev_bg <= bg; prev_bg++, bgptr++)
389 bgptr->trigrams = 0;
390 }
391 else {
392 int32 tgoff;
393
394 tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
395 if (tgoff > 65535) {
396 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
397 return -1;
398 }
399
400 bgptr = model->lm3g.bigrams + prev_bg;
401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
402 bgptr->trigrams = tgoff;
403 }
404
405 prev_w1 = w1;
406 prev_w2 = w2;
407 prev_bg = bg;
408 prev_seg = seg;
409 }
410
411 tgcount++;
412 tgptr++;
413
414 if ((tgcount & 0x0000ffff) == 0) {
415 E_INFOCONT(".");
416 }
417 }
418 if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
419 E_ERROR("Bad trigram: %s\n", (*li)->buf);
420 return -1;
421 }
422
423 for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
425 model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
426 if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427 E_ERROR("Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
428 return -1;
429 }
430 model->lm3g.bigrams[prev_bg].trigrams =
431 tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
432 }
433 return 0;
434}
435
436static unigram_t *
437new_unigram_table(int32 n_ug)
438{
439 unigram_t *table;
440 int32 i;
441
442 table = ckd_calloc(n_ug, sizeof(unigram_t));
443 for (i = 0; i < n_ug; i++) {
444 table[i].prob1.l = INT_MIN;
445 table[i].bo_wt1.l = INT_MIN;
446 }
447 return table;
448}
449
451ngram_model_arpa_read(cmd_ln_t *config,
452 const char *file_name,
453 logmath_t *lmath)
454{
455 lineiter_t *li;
456 FILE *fp;
457 int32 is_pipe;
458 int32 n_unigram;
459 int32 n_bigram;
460 int32 n_trigram;
461 int32 n;
462 ngram_model_arpa_t *model;
463 ngram_model_t *base;
464
465 if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
466 E_ERROR("File %s not found\n", file_name);
467 return NULL;
468 }
469 li = lineiter_start(fp);
470
471 /* Read #unigrams, #bigrams, #trigrams from file */
472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
473 lineiter_free(li);
474 fclose_comp(fp, is_pipe);
475 return NULL;
476 }
477 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
478
479 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
480 model = ckd_calloc(1, sizeof(*model));
481 base = &model->base;
482 if (n_trigram > 0)
483 n = 3;
484 else if (n_bigram > 0)
485 n = 2;
486 else
487 n = 1;
488 /* Initialize base model. */
489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
490 base->n_counts[0] = n_unigram;
491 base->n_counts[1] = n_bigram;
492 base->n_counts[2] = n_trigram;
493 base->writable = TRUE;
494
495 /*
496 * Allocate one extra unigram and bigram entry: sentinels to terminate
497 * followers (bigrams and trigrams, respectively) of previous entry.
498 */
499 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
500 model->lm3g.bigrams =
501 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
502 if (n_trigram > 0)
503 model->lm3g.trigrams =
504 ckd_calloc(n_trigram, sizeof(trigram_t));
505
506 if (n_trigram > 0) {
507 model->lm3g.tseg_base =
508 ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
509 sizeof(int32));
510 }
511 if (ReadUnigrams(&li, model) == -1) {
512 fclose_comp(fp, is_pipe);
513 ngram_model_free(base);
514 return NULL;
515 }
516 E_INFO("%8d = #unigrams created\n", base->n_counts[0]);
517
518 init_sorted_list(&model->sorted_prob2);
519 if (base->n_counts[2] > 0)
520 init_sorted_list(&model->sorted_bo_wt2);
521
522 if (base->n_counts[1] > 0) {
523 if (ReadBigrams(&li, model) == -1) {
524 fclose_comp(fp, is_pipe);
525 ngram_model_free(base);
526 return NULL;
527 }
528
529 base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
530 model->lm3g.n_prob2 = model->sorted_prob2.free;
531 model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
532 free_sorted_list(&model->sorted_prob2);
533 E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
534 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
535 }
536
537 if (base->n_counts[2] > 0) {
538 /* Create trigram bo-wts array */
539 model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
540 model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541 free_sorted_list(&model->sorted_bo_wt2);
542 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
543
544 init_sorted_list(&model->sorted_prob3);
545
546 if (ReadTrigrams(&li, model) == -1) {
547 fclose_comp(fp, is_pipe);
548 ngram_model_free(base);
549 return NULL;
550 }
551
552 base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
553 model->lm3g.n_prob3 = model->sorted_prob3.free;
554 model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
555 E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
556 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
557
558 free_sorted_list(&model->sorted_prob3);
559
560 /* Initialize tginfo */
561 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
562 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
563 }
564
565 lineiter_free(li);
566 fclose_comp(fp, is_pipe);
567 return base;
568}
569
570int
571ngram_model_arpa_write(ngram_model_t *model,
572 const char *file_name)
573{
574 ngram_iter_t *itor;
575 FILE *fh;
576 int i;
577
578 if ((fh = fopen(file_name, "w")) == NULL) {
579 E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
580 return -1;
581 }
582 fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");
583
584 /* The ARPA format doesn't require any extra information that
585 * N-Gram iterators can't give us, so this is very
586 * straightforward compared with DMP writing. */
587
588 /* Write N-gram counts. */
589 fprintf(fh, "\\data\\\n");
590 for (i = 0; i < model->n; ++i) {
591 fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
592 }
593
594 /* Write N-grams */
595 for (i = 0; i < model->n; ++i) {
596 fprintf(fh, "\n\\%d-grams:\n", i + 1);
597 for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
598 int32 const *wids;
599 int32 score, bowt;
600 int j;
601
602 wids = ngram_iter_get(itor, &score, &bowt);
603 fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
604 for (j = 0; j <= i; ++j) {
605 assert(wids[j] < model->n_counts[0]);
606 fprintf(fh, "%s ", model->word_str[wids[j]]);
607 }
608 if (i < model->n-1)
609 fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
610 fprintf(fh, "\n");
611 }
612 }
613 fprintf(fh, "\n\\end\\\n");
614 return fclose(fh);
615}
616
617static int
618ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
619 float32 wip, float32 uw)
620{
621 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
622 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
623 return 0;
624}
625
626/* Lousy "templating" for things that are largely the same in DMP and
627 * ARPA models, except for the bigram and trigram types and some
628 * names. */
629#define NGRAM_MODEL_TYPE ngram_model_arpa_t
630#include "lm3g_templates.c"
631
632static void
633ngram_model_arpa_free(ngram_model_t *base)
634{
635 ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
636 ckd_free(model->lm3g.unigrams);
637 ckd_free(model->lm3g.bigrams);
638 ckd_free(model->lm3g.trigrams);
639 ckd_free(model->lm3g.prob2);
640 ckd_free(model->lm3g.bo_wt2);
641 ckd_free(model->lm3g.prob3);
642 lm3g_tginfo_free(base, &model->lm3g);
643 ckd_free(model->lm3g.tseg_base);
644}
645
/* Virtual function table binding the ARPA reader into the generic
 * ngram_model_t interface.  Everything except free and apply_weights
 * comes from the shared lm3g template code included above. */
static ngram_funcs_t ngram_model_arpa_funcs = {
    ngram_model_arpa_free,          /* free */
    ngram_model_arpa_apply_weights, /* apply_weights */
    lm3g_template_score,            /* score */
    lm3g_template_raw_score,        /* raw_score */
    lm3g_template_add_ug,           /* add_ug */
    lm3g_template_flush,            /* flush */
    lm3g_template_iter,             /* iter */
    lm3g_template_mgrams,           /* mgrams */
    lm3g_template_successors,       /* successors */
    lm3g_template_iter_get,         /* iter_get */
    lm3g_template_iter_next,        /* iter_next */
    lm3g_template_iter_free         /* iter_free */
};
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_WARN
Print warning information to standard error stream.
Definition err.h:164
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
#define E_INFOCONT
Print logging information without header, to standard error stream.
Definition err.h:153
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:508
Fast memory allocator for uniformly sized objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
Definition logmath.c:480
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
Definition logmath.c:474
#define NGRAM_INVALID_WID
Impossible word ID.
Definition ngram_model.h:84
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
file IO related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition pio.c:358
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition pio.c:175
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i....
Definition pio.c:98
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition pio.c:255
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition pio.c:338
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition strfuncs.c:89
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:115
@ STRING_BOTH
Both ends of string.
Definition strfuncs.h:73
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition strfuncs.c:56
Bigram structure.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint16 bo_wt2
Index into array of actual bigram backoff wts.
uint16 prob2
Index into array of actual bigram probs.
uint32 wid
Index of unigram entry for this.
Opaque structure used to hold the results of command-line parsing.
Line iterator for files.
Definition pio.h:177
int32 n_prob2
prob2 size
Definition lm3g_model.h:147
listelem_alloc_t * le
List element allocator for tginfo.
Definition lm3g_model.h:156
lmprob_t * prob2
Table of actual bigram probs.
Definition lm3g_model.h:146
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ)
Definition lm3g_model.h:152
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Definition lm3g_model.h:148
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*,...
Definition lm3g_model.h:154
int32 n_bo_wt2
bo_wt2 size
Definition lm3g_model.h:149
lmprob_t * prob3
Table of actual trigram probs.
Definition lm3g_model.h:150
int32 n_prob3
prob3 size
Definition lm3g_model.h:151
Implementation-specific functions for operating on ngram_model_t objects.
Base iterator structure for N-grams.
Subclass of ngram_model for ARPA file reading.
ngram_model_t base
Base ngram_model_t structure.
lm3g_model_t lm3g
Shared lm3g structure.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
hash_table_t * wid
Mapping of unigram names to word IDs.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
char ** word_str
Unigram names.
int32 free
first free element in list
Definition lm3g_model.h:84
Trigram information cache.
Definition lm3g_model.h:129
Trigram structure.
uint32 wid
Index of unigram entry for this.
uint16 prob3
Index into array of actual trigram probs.
Unigram structure (common among all lm3g implementations)
Definition lm3g_model.h:91
lmprob_t prob1
Unigram probability.
Definition lm3g_model.h:92
lmprob_t bo_wt1
Unigram backoff weight.
Definition lm3g_model.h:93
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
Definition lm3g_model.h:94