SphinxBase 0.6
ngram_model_dmp.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * \file ngram_model_dmp.c DMP format language models
39 *
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41 */
42
43#include <assert.h>
44#include <stdio.h>
45#include <string.h>
46#include <stdlib.h>
47#include <limits.h>
48
50#include "sphinxbase/pio.h"
51#include "sphinxbase/err.h"
52#include "sphinxbase/byteorder.h"
54
55#include "ngram_model_dmp.h"
56
57static const char darpa_hdr[] = "Darpa Trigram LM";
58static ngram_funcs_t ngram_model_dmp_funcs;
59
60#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
61#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
62#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
63
64static unigram_t *
65new_unigram_table(int32 n_ug)
66{
67 unigram_t *table;
68 int32 i;
69
70 table = ckd_calloc(n_ug, sizeof(unigram_t));
71 for (i = 0; i < n_ug; i++) {
72 table[i].prob1.f = -99.0;
73 table[i].bo_wt1.f = -99.0;
74 }
75 return table;
76}
77
79ngram_model_dmp_read(cmd_ln_t *config,
80 const char *file_name,
81 logmath_t *lmath)
82{
83 ngram_model_t *base;
84 ngram_model_dmp_t *model;
85 FILE *fp;
86 int do_mmap, do_swap;
87 int32 is_pipe;
88 int32 i, j, k, vn, n, ts;
89 int32 n_unigram;
90 int32 n_bigram;
91 int32 n_trigram;
92 char str[1024];
93 unigram_t *ugptr;
94 bigram_t *bgptr;
95 trigram_t *tgptr;
96 char *tmp_word_str;
97 char *map_base = NULL;
98 size_t offset = 0;
99
100 base = NULL;
101 do_mmap = FALSE;
102 if (config)
103 do_mmap = cmd_ln_boolean_r(config, "-mmap");
104
105 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
106 E_ERROR("Dump file %s not found\n", file_name);
107 goto error_out;
108 }
109
110 if (is_pipe && do_mmap) {
111 E_WARN("Dump file is compressed, will not use memory-mapped I/O\n");
112 do_mmap = 0;
113 }
114
115 do_swap = FALSE;
116 if (fread(&k, sizeof(k), 1, fp) != 1)
117 goto error_out;
118 if (k != strlen(darpa_hdr)+1) {
119 SWAP_INT32(&k);
120 if (k != strlen(darpa_hdr)+1) {
121 E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
122 goto error_out;
123 }
124 do_swap = 1;
125 }
126 if (fread(str, 1, k, fp) != (size_t) k) {
127 E_ERROR("Cannot read header\n");
128 goto error_out;
129 }
130 if (strncmp(str, darpa_hdr, k) != 0) {
131 E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr);
132 goto error_out;
133 }
134
135 if (do_mmap) {
136 if (do_swap) {
137 E_INFO
138 ("Byteswapping required, will not use memory-mapped I/O for LM file\n");
139 do_mmap = 0;
140 }
141 else {
142 E_INFO("Will use memory-mapped I/O for LM file\n");
143#ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */
144 E_FATAL("memory mapping is not supported at the moment.");
145#else
146#endif
147 }
148 }
149
150 if (fread(&k, sizeof(k), 1, fp) != 1)
151 goto error_out;
152 if (do_swap) SWAP_INT32(&k);
153 if (fread(str, 1, k, fp) != (size_t) k) {
154 E_ERROR("Cannot read LM filename in header\n");
155 goto error_out;
156 }
157
158 /* read version#, if present (must be <= 0) */
159 if (fread(&vn, sizeof(vn), 1, fp) != 1)
160 goto error_out;
161 if (do_swap) SWAP_INT32(&vn);
162 if (vn <= 0) {
163 /* read and don't compare timestamps (we don't care) */
164 if (fread(&ts, sizeof(ts), 1, fp) != 1)
165 goto error_out;
166 if (do_swap) SWAP_INT32(&ts);
167
168 /* read and skip format description */
169 for (;;) {
170 if (fread(&k, sizeof(k), 1, fp) != 1)
171 goto error_out;
172 if (do_swap) SWAP_INT32(&k);
173 if (k == 0)
174 break;
175 if (fread(str, 1, k, fp) != (size_t) k) {
176 E_ERROR("Failed to read word\n");
177 goto error_out;
178 }
179 }
180 /* read model->ucount */
181 if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1)
182 goto error_out;
183 if (do_swap) SWAP_INT32(&n_unigram);
184 }
185 else {
186 n_unigram = vn;
187 }
188
189 /* read model->bcount, tcount */
190 if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1)
191 goto error_out;
192 if (do_swap) SWAP_INT32(&n_bigram);
193 if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1)
194 goto error_out;
195 if (do_swap) SWAP_INT32(&n_trigram);
196 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
197
198 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
199 model = ckd_calloc(1, sizeof(*model));
200 base = &model->base;
201 if (n_trigram > 0)
202 n = 3;
203 else if (n_bigram > 0)
204 n = 2;
205 else
206 n = 1;
207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
208 base->n_counts[0] = n_unigram;
209 base->n_counts[1] = n_bigram;
210 base->n_counts[2] = n_trigram;
211
212 /* read unigrams (always in memory, as they contain dictionary
213 * mappings that can't be precomputed, and also could have OOVs added) */
214 model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
215 ugptr = model->lm3g.unigrams;
216 for (i = 0; i <= n_unigram; ++i) {
217 /* Skip over the mapping ID, we don't care about it. */
218 if (fread(ugptr, sizeof(int32), 1, fp) != 1) {
219 E_ERROR("Failed to read maping id %d\n", i);
220 goto error_out;
221 }
222 /* Read the actual unigram structure. */
223 if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) {
224 E_ERROR("Failed to read unigrams data\n");
225 ngram_model_free(base);
226 fclose_comp(fp, is_pipe);
227 return NULL;
228 }
229 /* Byte swap if necessary. */
230 if (do_swap) {
231 SWAP_INT32(&ugptr->prob1.l);
232 SWAP_INT32(&ugptr->bo_wt1.l);
233 SWAP_INT32(&ugptr->bigrams);
234 }
235 /* Convert values to log. */
236 ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f);
237 ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f);
238 E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n",
239 i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams));
240 ++ugptr;
241 }
242 E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram);
243
244 /* Now mmap() the file and read in the rest of the (read-only) stuff. */
245 if (do_mmap) {
246 offset = ftell(fp);
247
248 /* Check for improper word alignment. */
249 if (offset & 0x3) {
250 E_WARN("-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n");
251 do_mmap = FALSE;
252 }
253 else {
254 model->dump_mmap = mmio_file_read(file_name);
255 if (model->dump_mmap == NULL) {
256 do_mmap = FALSE;
257 }
258 else {
259 map_base = mmio_file_ptr(model->dump_mmap);
260 }
261 }
262 }
263
264 if (n_bigram > 0) {
265 /* read bigrams */
266 if (do_mmap) {
267 model->lm3g.bigrams = (bigram_t *) (map_base + offset);
268 offset += (n_bigram + 1) * sizeof(bigram_t);
269 }
270 else {
271 model->lm3g.bigrams =
272 ckd_calloc(n_bigram + 1, sizeof(bigram_t));
273 if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp)
274 != (size_t) n_bigram + 1) {
275 E_ERROR("Failed to read bigrams data\n");
276 goto error_out;
277 }
278 if (do_swap) {
279 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram;
280 i++, bgptr++) {
281 SWAP_INT16(&bgptr->wid);
282 SWAP_INT16(&bgptr->prob2);
283 SWAP_INT16(&bgptr->bo_wt2);
284 SWAP_INT16(&bgptr->trigrams);
285 }
286 }
287 }
288 E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram);
289 }
290
291 /* read trigrams */
292 if (n_trigram > 0) {
293 if (do_mmap) {
294 model->lm3g.trigrams = (trigram_t *) (map_base + offset);
295 offset += n_trigram * sizeof(trigram_t);
296 }
297 else {
298 model->lm3g.trigrams =
299 ckd_calloc(n_trigram, sizeof(trigram_t));
300 if (fread
301 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp)
302 != (size_t) n_trigram) {
303 E_ERROR("Failed to read trigrams data\n");
304 goto error_out;
305 }
306 if (do_swap) {
307 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram;
308 i++, tgptr++) {
309 SWAP_INT16(&tgptr->wid);
310 SWAP_INT16(&tgptr->prob3);
311 }
312 }
313 }
314 E_INFO("%8d = LM.trigrams read\n", n_trigram);
315 /* Initialize tginfo */
316 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
317 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
318 }
319
320 if (n_bigram > 0) {
321 /* read n_prob2 and prob2 array (in memory) */
322 if (do_mmap)
323 fseek(fp, offset, SEEK_SET);
324 if (fread(&k, sizeof(k), 1, fp) != 1)
325 goto error_out;
326 if (do_swap) SWAP_INT32(&k);
327 model->lm3g.n_prob2 = k;
328 model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2));
329 if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) {
330 E_ERROR("fread(prob2) failed\n");
331 goto error_out;
332 }
333 for (i = 0; i < k; i++) {
334 if (do_swap)
335 SWAP_INT32(&model->lm3g.prob2[i].l);
336 /* Convert values to log. */
337 model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f);
338 }
339 E_INFO("%8d = LM.prob2 entries read\n", k);
340 }
341
342 /* read n_bo_wt2 and bo_wt2 array (in memory) */
343 if (base->n > 2) {
344 if (fread(&k, sizeof(k), 1, fp) != 1)
345 goto error_out;
346 if (do_swap) SWAP_INT32(&k);
347 model->lm3g.n_bo_wt2 = k;
348 model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2));
349 if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) {
350 E_ERROR("Failed to read backoff weights\n");
351 goto error_out;
352 }
353 for (i = 0; i < k; i++) {
354 if (do_swap)
355 SWAP_INT32(&model->lm3g.bo_wt2[i].l);
356 /* Convert values to log. */
357 model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f);
358 }
359 E_INFO("%8d = LM.bo_wt2 entries read\n", k);
360 }
361
362 /* read n_prob3 and prob3 array (in memory) */
363 if (base->n > 2) {
364 if (fread(&k, sizeof(k), 1, fp) != 1)
365 goto error_out;
366 if (do_swap) SWAP_INT32(&k);
367 model->lm3g.n_prob3 = k;
368 model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3));
369 if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) {
370 E_ERROR("Failed to read trigram probability\n");
371 goto error_out;
372 }
373 for (i = 0; i < k; i++) {
374 if (do_swap)
375 SWAP_INT32(&model->lm3g.prob3[i].l);
376 /* Convert values to log. */
377 model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f);
378 }
379 E_INFO("%8d = LM.prob3 entries read\n", k);
380 }
381
382 /* read tseg_base size and tseg_base */
383 if (do_mmap)
384 offset = ftell(fp);
385 if (n_trigram > 0) {
386 if (do_mmap) {
387 memcpy(&k, map_base + offset, sizeof(k));
388 offset += sizeof(int32);
389 model->lm3g.tseg_base = (int32 *) (map_base + offset);
390 offset += k * sizeof(int32);
391 }
392 else {
393 k = (n_bigram + 1) / BG_SEG_SZ + 1;
394 if (fread(&k, sizeof(k), 1, fp) != 1)
395 goto error_out;
396 if (do_swap) SWAP_INT32(&k);
397 model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32));
398 if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) !=
399 (size_t) k) {
400 E_ERROR("Failed to read trigram index\n");
401 goto error_out;
402 }
403 if (do_swap)
404 for (i = 0; i < k; i++)
405 SWAP_INT32(&model->lm3g.tseg_base[i]);
406 }
407 E_INFO("%8d = LM.tseg_base entries read\n", k);
408 }
409
410 /* read ascii word strings */
411 if (do_mmap) {
412 memcpy(&k, map_base + offset, sizeof(k));
413 offset += sizeof(int32);
414 tmp_word_str = (char *) (map_base + offset);
415 offset += k;
416 }
417 else {
418 base->writable = TRUE;
419 if (fread(&k, sizeof(k), 1, fp) != 1)
420 goto error_out;
421 if (do_swap) SWAP_INT32(&k);
422 tmp_word_str = ckd_calloc(k, 1);
423 if (fread(tmp_word_str, 1, k, fp) != (size_t) k) {
424 E_ERROR("Failed to read words\n");
425 goto error_out;
426 }
427 }
428
429 /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
430 for (i = 0, j = 0; i < k; i++)
431 if (tmp_word_str[i] == '\0')
432 j++;
433 if (j != n_unigram) {
434 E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n",
435 j, n_unigram);
436 goto error_out;
437 }
438
439 /* Break up string just read into words */
440 if (do_mmap) {
441 j = 0;
442 for (i = 0; i < n_unigram; i++) {
443 base->word_str[i] = tmp_word_str + j;
444 if (hash_table_enter(base->wid, base->word_str[i],
445 (void *)(long)i) != (void *)(long)i) {
446 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
447 }
448 j += strlen(base->word_str[i]) + 1;
449 }
450 }
451 else {
452 j = 0;
453 for (i = 0; i < n_unigram; i++) {
454 base->word_str[i] = ckd_salloc(tmp_word_str + j);
455 if (hash_table_enter(base->wid, base->word_str[i],
456 (void *)(long)i) != (void *)(long)i) {
457 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]);
458 }
459 j += strlen(base->word_str[i]) + 1;
460 }
461 free(tmp_word_str);
462 }
463 E_INFO("%8d = ascii word strings read\n", i);
464
465 fclose_comp(fp, is_pipe);
466 return base;
467
468error_out:
469 if (fp)
470 fclose_comp(fp, is_pipe);
471 ngram_model_free(base);
472 return NULL;
473}
474
476ngram_model_dmp_build(ngram_model_t *base)
477{
478 ngram_model_dmp_t *model;
479 ngram_model_t *newbase;
480 ngram_iter_t *itor;
481 sorted_list_t sorted_prob2;
482 sorted_list_t sorted_bo_wt2;
483 sorted_list_t sorted_prob3;
484 bigram_t *bgptr;
485 trigram_t *tgptr;
486 int i, bgcount, tgcount, seg;
487
488 if (base->funcs == &ngram_model_dmp_funcs) {
489 E_INFO("Using existing DMP model.\n");
491 }
492
493 /* Initialize new base model structure with params from base. */
494 E_INFO("Building DMP model...\n");
495 model = ckd_calloc(1, sizeof(*model));
496 newbase = &model->base;
497 ngram_model_init(newbase, &ngram_model_dmp_funcs,
498 logmath_retain(base->lmath),
499 base->n, base->n_counts[0]);
500 /* Copy N-gram counts over. */
501 memcpy(newbase->n_counts, base->n_counts,
502 base->n * sizeof(*base->n_counts));
503 /* Make sure word strings are freed. */
504 newbase->writable = TRUE;
505 /* Initialize unigram table and string table. */
506 model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1);
507 for (itor = ngram_model_mgrams(base, 0); itor;
508 itor = ngram_iter_next(itor)) {
509 int32 prob1, bo_wt1;
510 int32 const *wids;
511
512 /* Can't guarantee they will go in unigram order, so just to
513 * be correct, we do this... */
514 wids = ngram_iter_get(itor, &prob1, &bo_wt1);
515 model->lm3g.unigrams[wids[0]].prob1.l = prob1;
516 model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1;
517 newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0]));
518 if ((hash_table_enter_int32(newbase->wid,
519 newbase->word_str[wids[0]], wids[0]))
520 != wids[0]) {
521 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]);
522 }
523 }
524 E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]);
525
526 if (newbase->n < 2)
527 return model;
528
529 /* Construct quantized probability table for bigrams and
530 * (optionally) trigrams. Hesitate to use the "sorted list" thing
531 * since it isn't so useful, but it's there already. */
532 init_sorted_list(&sorted_prob2);
533 if (newbase->n > 2) {
534 init_sorted_list(&sorted_bo_wt2);
535 init_sorted_list(&sorted_prob3);
536 }
537 /* Construct bigram and trigram arrays. */
538 bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t));
539 if (newbase->n > 2) {
540 tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t));
541 model->lm3g.tseg_base =
542 ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32));
543 }
544 else
545 tgptr = NULL;
546 /* Since bigrams and trigrams have to be contiguous with others
547 * with the same N-1-gram, we traverse them in depth-first order
548 * to build the bigram and trigram arrays. */
549 for (i = 0; i < newbase->n_counts[0]; ++i) {
550 ngram_iter_t *uitor;
551 bgcount = bgptr - model->lm3g.bigrams;
552 /* First bigram index (same as next if no bigrams...) */
553 model->lm3g.unigrams[i].bigrams = bgcount;
554 E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount));
555 /* All bigrams corresponding to unigram i */
556 uitor = ngram_ng_iter(base, i, NULL, 0);
557 for (itor = ngram_iter_successors(uitor);
558 itor; ++bgptr, itor = ngram_iter_next(itor)) {
559 int32 prob2, bo_wt2;
560 int32 const *wids;
561 ngram_iter_t *titor;
562
563 wids = ngram_iter_get(itor, &prob2, &bo_wt2);
564
565 assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]);
566
567 bgptr->wid = wids[1];
568 bgptr->prob2 = sorted_id(&sorted_prob2, &prob2);
569 if (newbase->n > 2) {
570 tgcount = (tgptr - model->lm3g.trigrams);
571 bgcount = (bgptr - model->lm3g.bigrams);
572
573 /* Backoff weight (only if there are trigrams...) */
574 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
575
576 /* Find bigram segment for this bigram (this isn't
577 * used unless there are trigrams) */
578 seg = bgcount >> LOG_BG_SEG_SZ;
579 /* If we just crossed a bigram segment boundary, then
580 * point tseg_base for the new segment to the current
581 * trigram pointer. */
582 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
583 model->lm3g.tseg_base[seg] = tgcount;
584 /* Now calculate the trigram offset. */
585 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
586 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n",
587 bgcount,
588 newbase->word_str[wids[0]],
589 newbase->word_str[wids[1]],
590 seg, bgptr->trigrams));
591
592 /* And fill in successors' trigram info. */
593 for (titor = ngram_iter_successors(itor);
594 titor; ++tgptr, titor = ngram_iter_next(titor)) {
595 int32 prob3, dummy;
596
597 assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]);
598 wids = ngram_iter_get(titor, &prob3, &dummy);
599 tgptr->wid = wids[2];
600 tgptr->prob3 = sorted_id(&sorted_prob3, &prob3);
601 E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n",
602 tgcount,
603 newbase->word_str[wids[0]],
604 newbase->word_str[wids[1]],
605 newbase->word_str[wids[2]],
606 tgptr->prob3));
607 }
608 }
609 }
610 ngram_iter_free(uitor);
611 }
612 /* Add sentinal unigram and bigram records. */
613 bgcount = bgptr - model->lm3g.bigrams;
614 tgcount = tgptr - model->lm3g.trigrams;
615 seg = bgcount >> LOG_BG_SEG_SZ;
616 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
617 model->lm3g.tseg_base[seg] = tgcount;
618 model->lm3g.unigrams[i].bigrams = bgcount;
619 if (newbase->n > 2)
620 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg];
621
622 /* Now create probability tables. */
623 model->lm3g.n_prob2 = sorted_prob2.free;
624 model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2);
625 E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]);
626 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
627 free_sorted_list(&sorted_prob2);
628 if (newbase->n > 2) {
629 /* Create trigram bo-wts array. */
630 model->lm3g.n_bo_wt2 = sorted_bo_wt2.free;
631 model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
632 free_sorted_list(&sorted_bo_wt2);
633 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);
634 /* Create trigram probability table. */
635 model->lm3g.n_prob3 = sorted_prob3.free;
636 model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3);
637 E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]);
638 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);
639 free_sorted_list(&sorted_prob3);
640 /* Initialize tginfo */
641 model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *));
642 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
643 }
644
645 return model;
646}
647
648static void
649fwrite_int32(FILE *fh, int32 val)
650{
651 fwrite(&val, 4, 1, fh);
652}
653
654static void
655fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath)
656{
657 int32 bogus = -1;
658 float32 log10val;
659
660 /* Bogus dictionary mapping field. */
661 fwrite(&bogus, 4, 1, fh);
662 /* Convert values to log10. */
663 log10val = logmath_log_to_log10(lmath, ug->prob1.l);
664 fwrite(&log10val, 4, 1, fh);
665 log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l);
666 fwrite(&log10val, 4, 1, fh);
667 fwrite_int32(fh, ug->bigrams);
668}
669
670static void
671fwrite_bg(FILE *fh, bigram_t *bg)
672{
673 fwrite(bg, sizeof(*bg), 1, fh);
674}
675
676static void
677fwrite_tg(FILE *fh, trigram_t *tg)
678{
679 fwrite(tg, sizeof(*tg), 1, fh);
680}
681
/*
 * Human-readable description of the dump layout, one string per line.
 * The list is NULL-terminated.
 */
static const char *fmtdesc[] = {
    "BEGIN FILE FORMAT DESCRIPTION",
    "Header string length (int32) and string (including trailing 0)",
    "Original LM filename string-length (int32) and filename (including trailing 0)",
    "(int32) version number (present iff value <= 0)",
    "(int32) original LM file modification timestamp (iff version# present)",
    "(int32) string-length and string (including trailing 0) (iff version# present)",
    "... previous entry continued any number of times (iff version# present)",
    "(int32) 0 (terminating sequence of strings) (iff version# present)",
    "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
    "(int32) lm_t.ucount (must be > 0)",
    "(int32) lm_t.bcount",
    "(int32) lm_t.tcount",
    "lm_t.ucount+1 unigrams (including sentinel)",
    "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
    "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
    "(int32) lm_t.n_prob2",
    "(int32) lm_t.prob2[]",
    "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
    "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
    "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
    "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
    "(int32) Sum(all word string-lengths, including trailing 0 for each)",
    "All word strings (including trailing 0 for each)",
    "END FILE FORMAT DESCRIPTION",
    NULL,
};
713
714static void
715ngram_model_dmp_write_header(FILE * fh)
716{
717 int32 k;
718 k = strlen(darpa_hdr) + 1;
719 fwrite_int32(fh, k);
720 fwrite(darpa_hdr, 1, k, fh);
721}
722
723static void
724ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile)
725{
726 int32 k;
727
728 k = strlen(lmfile) + 1;
729 fwrite_int32(fh, k);
730 fwrite(lmfile, 1, k, fh);
731}
732
733#define LMDMP_VERSION_TG_16BIT -1
737static void
738ngram_model_dmp_write_version(FILE * fh, int32 mtime)
739{
740 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */
741 fwrite_int32(fh, mtime);
742}
743
744static void
745ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model)
746{
747 fwrite_int32(fh, model->n_counts[0]);
748 fwrite_int32(fh, model->n_counts[1]);
749 fwrite_int32(fh, model->n_counts[2]);
750}
751
752static void
753ngram_model_dmp_write_fmtdesc(FILE * fh)
754{
755 int32 i, k;
756 long pos;
757
758 /* Write file format description into header */
759 for (i = 0; fmtdesc[i] != NULL; i++) {
760 k = strlen(fmtdesc[i]) + 1;
761 fwrite_int32(fh, k);
762 fwrite(fmtdesc[i], 1, k, fh);
763 }
764 /* Pad it out in order to achieve 32-bit alignment */
765 pos = ftell(fh);
766 k = pos & 3;
767 if (k) {
768 fwrite_int32(fh, 4-k);
769 fwrite("!!!!", 1, 4-k, fh);
770 }
771 fwrite_int32(fh, 0);
772}
773
774static void
775ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model)
776{
778 int32 i;
779
780 for (i = 0; i <= model->n_counts[0]; i++) {
781 fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath);
782 }
783}
784
785
786static void
787ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model)
788{
790 int32 i;
791
792 for (i = 0; i <= model->n_counts[1]; i++) {
793 fwrite_bg(fh, &(lm->lm3g.bigrams[i]));
794 }
795
796}
797
798static void
799ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model)
800{
802 int32 i;
803
804 for (i = 0; i < model->n_counts[2]; i++) {
805 fwrite_tg(fh, &(lm->lm3g.trigrams[i]));
806 }
807}
808
809static void
810ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model)
811{
813 int32 i;
814
815 fwrite_int32(fh, lm->lm3g.n_prob2);
816 for (i = 0; i < lm->lm3g.n_prob2; i++) {
817 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l);
818 fwrite(&log10val, 4, 1, fh);
819 }
820}
821
822static void
823ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model)
824{
826 int32 i;
827
828 fwrite_int32(fh, lm->lm3g.n_bo_wt2);
829 for (i = 0; i < lm->lm3g.n_bo_wt2; i++) {
830 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l);
831 fwrite(&log10val, 4, 1, fh);
832 }
833}
834
835static void
836ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model)
837{
839 int32 i;
840
841 fwrite_int32(fh, lm->lm3g.n_prob3);
842 for (i = 0; i < lm->lm3g.n_prob3; i++) {
843 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l);
844 fwrite(&log10val, 4, 1, fh);
845 }
846}
847
848static void
849ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model)
850{
852 int32 i, k;
853
854 k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1;
855 fwrite_int32(fh, k);
856 for (i = 0; i < k; i++)
857 fwrite_int32(fh, lm->lm3g.tseg_base[i]);
858}
859
860static void
861ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model)
862{
863 int32 i, k;
864
865 k = 0;
866 for (i = 0; i < model->n_counts[0]; i++)
867 k += strlen(model->word_str[i]) + 1;
868 fwrite_int32(fh, k);
869 for (i = 0; i < model->n_counts[0]; i++)
870 fwrite(model->word_str[i], 1,
871 strlen(model->word_str[i]) + 1, fh);
872}
873
874int
875ngram_model_dmp_write(ngram_model_t *base,
876 const char *file_name)
877{
878 ngram_model_dmp_t *model;
879 ngram_model_t *newbase;
880 FILE *fh;
881
882 /* First, construct a DMP model from the base model. */
883 model = ngram_model_dmp_build(base);
884 newbase = &model->base;
885
886 /* Now write it, confident in the knowledge that it's the right
887 * kind of language model internally. */
888 if ((fh = fopen(file_name, "wb")) == NULL) {
889 E_ERROR("Cannot create file %s\n", file_name);
890 return -1;
891 }
892 ngram_model_dmp_write_header(fh);
893 ngram_model_dmp_write_lm_filename(fh, file_name);
894 ngram_model_dmp_write_version(fh, 0);
895 ngram_model_dmp_write_fmtdesc(fh);
896 ngram_model_dmp_write_ngram_counts(fh, newbase);
897 ngram_model_dmp_write_unigram(fh, newbase);
898 if (newbase->n > 1) {
899 ngram_model_dmp_write_bigram(fh, newbase);
900 if (newbase->n > 2) {
901 ngram_model_dmp_write_trigram(fh, newbase);
902 }
903 ngram_model_dmp_write_bgprob(fh, newbase);
904 if (newbase->n > 2) {
905 ngram_model_dmp_write_tgbowt(fh, newbase);
906 ngram_model_dmp_write_tgprob(fh, newbase);
907 ngram_model_dmp_write_tg_segbase(fh, newbase);
908 }
909 }
910 ngram_model_dmp_write_wordstr(fh, newbase);
911 ngram_model_free(newbase);
912
913 return fclose(fh);
914}
915
916static int
917ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw,
918 float32 wip, float32 uw)
919{
920 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
921 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
922 return 0;
923}
924
925/* Lousy "templating" for things that are largely the same in DMP and
926 * ARPA models, except for the bigram and trigram types and some
927 * names. */
928#define NGRAM_MODEL_TYPE ngram_model_dmp_t
929#include "lm3g_templates.c"
930
931static void
932ngram_model_dmp_free(ngram_model_t *base)
933{
934 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base;
935
936 ckd_free(model->lm3g.unigrams);
937 ckd_free(model->lm3g.prob2);
938 if (model->dump_mmap) {
940 }
941 else {
942 ckd_free(model->lm3g.bigrams);
943 if (base->n > 2) {
944 ckd_free(model->lm3g.trigrams);
945 ckd_free(model->lm3g.tseg_base);
946 }
947 }
948 if (base->n > 2) {
949 ckd_free(model->lm3g.bo_wt2);
950 ckd_free(model->lm3g.prob3);
951 }
952
953 lm3g_tginfo_free(base, &model->lm3g);
954}
955
956static ngram_funcs_t ngram_model_dmp_funcs = {
957 ngram_model_dmp_free, /* free */
958 ngram_model_dmp_apply_weights, /* apply_weights */
959 lm3g_template_score, /* score */
960 lm3g_template_raw_score, /* raw_score */
961 lm3g_template_add_ug, /* add_ug */
962 lm3g_template_flush, /* flush */
963 lm3g_template_iter, /* iter */
964 lm3g_template_mgrams, /* mgrams */
965 lm3g_template_successors, /* successors */
966 lm3g_template_iter_get, /* iter_get */
967 lm3g_template_iter_next, /* iter_next */
968 lm3g_template_iter_free /* iter_free */
969};
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition cmd_ln.h:334
Implementation of logging routines.
#define E_FATAL
Exit with non-zero status after error message.
Definition err.h:127
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_WARN
Print warning information to standard error stream.
Definition err.h:164
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
#define E_DEBUG(level, x)
Print debugging information to standard error stream.
Definition err.h:212
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
Definition hash_table.h:228
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:508
Fast memory allocator for uniformly sized objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
Definition logmath.c:480
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
Definition logmath.c:474
SPHINXBASE_EXPORT logmath_t * logmath_retain(logmath_t *lmath)
Retain ownership of a log table.
Definition logmath.c:335
SPHINXBASE_EXPORT void mmio_file_unmap(mmio_file_t *mf)
Unmap a file, releasing memory associated with it.
Definition mmio.c:240
SPHINXBASE_EXPORT mmio_file_t * mmio_file_read(const char *filename)
Memory-map a file for reading.
Definition mmio.c:206
SPHINXBASE_EXPORT void * mmio_file_ptr(mmio_file_t *mf)
Get a pointer to the memory mapped for a file.
Definition mmio.c:251
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
file IO related operations.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition pio.c:175
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i....
Definition pio.c:98
Bigram structure.
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint16 bo_wt2
Index into array of actual bigram backoff wts.
uint16 prob2
Index into array of actual bigram probs.
uint32 wid
Index of unigram entry for this.
Opaque structure used to hold the results of command-line parsing.
int32 n_prob2
prob2 size
Definition lm3g_model.h:147
listelem_alloc_t * le
List element allocator for tginfo.
Definition lm3g_model.h:156
lmprob_t * prob2
Table of actual bigram probs.
Definition lm3g_model.h:146
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ)
Definition lm3g_model.h:152
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
Definition lm3g_model.h:148
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*,...
Definition lm3g_model.h:154
int32 n_bo_wt2
bo_wt2 size
Definition lm3g_model.h:149
lmprob_t * prob3
Table of actual trigram probs.
Definition lm3g_model.h:150
int32 n_prob3
prob3 size
Definition lm3g_model.h:151
Implementation-specific functions for operating on ngram_model_t objects.
Base iterator structure for N-grams.
Subclass of ngram_model for DMP file reading.
mmio_file_t * dump_mmap
mmap() of dump file (or NULL if none)
lm3g_model_t lm3g
Common lm3g_model_t structure.
ngram_model_t base
Base ngram_model_t structure.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
hash_table_t * wid
Mapping of unigram names to word IDs.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
struct ngram_funcs_s * funcs
Implementation-specific methods.
char ** word_str
Unigram names.
The sorted list.
Definition lm3g_model.h:82
int32 free
first free element in list
Definition lm3g_model.h:84
Trigram information cache.
Definition lm3g_model.h:129
Trigram structure.
uint32 wid
Index of unigram entry for this.
uint16 prob3
Index into array of actual trigram probs.
Unigram structure (common among all lm3g implementations)
Definition lm3g_model.h:91
lmprob_t prob1
Unigram probability.
Definition lm3g_model.h:92
lmprob_t bo_wt1
Unigram backoff weight.
Definition lm3g_model.h:93
int32 bigrams
Index of 1st entry in lm_t.bigrams[].
Definition lm3g_model.h:94