52#include "sphinxbase/byteorder.h"
55#include "ngram_model_dmp.h"
/* Magic header string of a DMP (dump) language model file.  The reader
 * compares it against the header read from the file and the writer emits
 * it; in both cases the length used is strlen(darpa_hdr) + 1, i.e. the
 * trailing NUL is included. */
57static const char darpa_hdr[] =
"Darpa Trigram LM";
/* Trigram segment base for bigram index b: the tseg_base[] entry selected
 * by b >> LOG_BG_SEG_SZ. */
60#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
/* Index of the first bigram entry belonging to unigram u. */
61#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
/* Absolute index of the first trigram of bigram b: the bigram's
 * segment-relative trigram offset added to its segment base. */
62#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
65new_unigram_table(int32 n_ug)
71 for (i = 0; i < n_ug; i++) {
72 table[i].
prob1.f = -99.0;
79ngram_model_dmp_read(
cmd_ln_t *config,
80 const char *file_name,
88 int32 i, j, k, vn, n, ts;
97 char *map_base = NULL;
105 if ((fp =
fopen_comp(file_name,
"rb", &is_pipe)) == NULL) {
106 E_ERROR(
"Dump file %s not found\n", file_name);
110 if (is_pipe && do_mmap) {
111 E_WARN(
"Dump file is compressed, will not use memory-mapped I/O\n");
116 if (fread(&k,
sizeof(k), 1, fp) != 1)
118 if (k != strlen(darpa_hdr)+1) {
120 if (k != strlen(darpa_hdr)+1) {
121 E_ERROR(
"Wrong magic header size number %x: %s is not a dump file\n", k, file_name);
126 if (fread(str, 1, k, fp) != (
size_t) k) {
127 E_ERROR(
"Cannot read header\n");
130 if (strncmp(str, darpa_hdr, k) != 0) {
131 E_ERROR(
"Wrong header %s: %s is not a dump file\n", darpa_hdr);
138 (
"Byteswapping required, will not use memory-mapped I/O for LM file\n");
142 E_INFO(
"Will use memory-mapped I/O for LM file\n");
143#ifdef __ADSPBLACKFIN__
144 E_FATAL(
"memory mapping is not supported at the moment.");
150 if (fread(&k,
sizeof(k), 1, fp) != 1)
152 if (do_swap) SWAP_INT32(&k);
153 if (fread(str, 1, k, fp) != (
size_t) k) {
154 E_ERROR(
"Cannot read LM filename in header\n");
159 if (fread(&vn,
sizeof(vn), 1, fp) != 1)
161 if (do_swap) SWAP_INT32(&vn);
164 if (fread(&ts,
sizeof(ts), 1, fp) != 1)
166 if (do_swap) SWAP_INT32(&ts);
170 if (fread(&k,
sizeof(k), 1, fp) != 1)
172 if (do_swap) SWAP_INT32(&k);
175 if (fread(str, 1, k, fp) != (
size_t) k) {
176 E_ERROR(
"Failed to read word\n");
181 if (fread(&n_unigram,
sizeof(n_unigram), 1, fp) != 1)
183 if (do_swap) SWAP_INT32(&n_unigram);
190 if (fread(&n_bigram,
sizeof(n_bigram), 1, fp) != 1)
192 if (do_swap) SWAP_INT32(&n_bigram);
193 if (fread(&n_trigram,
sizeof(n_trigram), 1, fp) != 1)
195 if (do_swap) SWAP_INT32(&n_trigram);
196 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
203 else if (n_bigram > 0)
207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram);
214 model->
lm3g.unigrams = new_unigram_table(n_unigram + 1);
215 ugptr = model->
lm3g.unigrams;
216 for (i = 0; i <= n_unigram; ++i) {
218 if (fread(ugptr,
sizeof(int32), 1, fp) != 1) {
219 E_ERROR(
"Failed to read maping id %d\n", i);
223 if (fread(ugptr,
sizeof(
unigram_t), 1, fp) != 1) {
224 E_ERROR(
"Failed to read unigrams data\n");
231 SWAP_INT32(&ugptr->
prob1.l);
232 SWAP_INT32(&ugptr->
bo_wt1.l);
238 E_DEBUG(2, (
"ug %d: prob %d bo %d bigrams %d\n",
242 E_INFO(
"%8d = LM.unigrams(+trailer) read\n", n_unigram);
250 E_WARN(
"-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n");
268 offset += (n_bigram + 1) *
sizeof(
bigram_t);
271 model->
lm3g.bigrams =
273 if (fread(model->
lm3g.bigrams,
sizeof(
bigram_t), n_bigram + 1, fp)
274 != (
size_t) n_bigram + 1) {
275 E_ERROR(
"Failed to read bigrams data\n");
279 for (i = 0, bgptr = model->
lm3g.bigrams; i <= n_bigram;
281 SWAP_INT16(&bgptr->
wid);
282 SWAP_INT16(&bgptr->
prob2);
283 SWAP_INT16(&bgptr->
bo_wt2);
288 E_INFO(
"%8d = LM.bigrams(+trailer) read\n", n_bigram);
298 model->
lm3g.trigrams =
302 != (
size_t) n_trigram) {
303 E_ERROR(
"Failed to read trigrams data\n");
307 for (i = 0, tgptr = model->
lm3g.trigrams; i < n_trigram;
309 SWAP_INT16(&tgptr->
wid);
310 SWAP_INT16(&tgptr->
prob3);
314 E_INFO(
"%8d = LM.trigrams read\n", n_trigram);
323 fseek(fp, offset, SEEK_SET);
324 if (fread(&k,
sizeof(k), 1, fp) != 1)
326 if (do_swap) SWAP_INT32(&k);
330 E_ERROR(
"fread(prob2) failed\n");
333 for (i = 0; i < k; i++) {
339 E_INFO(
"%8d = LM.prob2 entries read\n", k);
344 if (fread(&k,
sizeof(k), 1, fp) != 1)
346 if (do_swap) SWAP_INT32(&k);
350 E_ERROR(
"Failed to read backoff weights\n");
353 for (i = 0; i < k; i++) {
359 E_INFO(
"%8d = LM.bo_wt2 entries read\n", k);
364 if (fread(&k,
sizeof(k), 1, fp) != 1)
366 if (do_swap) SWAP_INT32(&k);
370 E_ERROR(
"Failed to read trigram probability\n");
373 for (i = 0; i < k; i++) {
379 E_INFO(
"%8d = LM.prob3 entries read\n", k);
387 memcpy(&k, map_base + offset,
sizeof(k));
388 offset +=
sizeof(int32);
390 offset += k *
sizeof(int32);
393 k = (n_bigram + 1) / BG_SEG_SZ + 1;
394 if (fread(&k,
sizeof(k), 1, fp) != 1)
396 if (do_swap) SWAP_INT32(&k);
400 E_ERROR(
"Failed to read trigram index\n");
404 for (i = 0; i < k; i++)
407 E_INFO(
"%8d = LM.tseg_base entries read\n", k);
412 memcpy(&k, map_base + offset,
sizeof(k));
413 offset +=
sizeof(int32);
414 tmp_word_str = (
char *) (map_base + offset);
419 if (fread(&k,
sizeof(k), 1, fp) != 1)
421 if (do_swap) SWAP_INT32(&k);
423 if (fread(tmp_word_str, 1, k, fp) != (
size_t) k) {
424 E_ERROR(
"Failed to read words\n");
430 for (i = 0, j = 0; i < k; i++)
431 if (tmp_word_str[i] ==
'\0')
433 if (j != n_unigram) {
434 E_ERROR(
"Error reading word strings (%d doesn't match n_unigrams %d)\n",
442 for (i = 0; i < n_unigram; i++) {
443 base->
word_str[i] = tmp_word_str + j;
445 (
void *)(
long)i) != (
void *)(
long)i) {
453 for (i = 0; i < n_unigram; i++) {
456 (
void *)(
long)i) != (
void *)(
long)i) {
463 E_INFO(
"%8d = ascii word strings read\n", i);
486 int i, bgcount, tgcount, seg;
488 if (base->
funcs == &ngram_model_dmp_funcs) {
489 E_INFO(
"Using existing DMP model.\n");
494 E_INFO(
"Building DMP model...\n");
496 newbase = &model->
base;
497 ngram_model_init(newbase, &ngram_model_dmp_funcs,
506 model->
lm3g.unigrams = new_unigram_table(newbase->
n_counts[0] + 1);
515 model->
lm3g.unigrams[wids[0]].
prob1.l = prob1;
516 model->
lm3g.unigrams[wids[0]].
bo_wt1.l = bo_wt1;
519 newbase->
word_str[wids[0]], wids[0]))
521 E_WARN(
"Duplicate word in dictionary: %s\n", newbase->
word_str[wids[0]]);
532 init_sorted_list(&sorted_prob2);
533 if (newbase->
n > 2) {
534 init_sorted_list(&sorted_bo_wt2);
535 init_sorted_list(&sorted_prob3);
539 if (newbase->
n > 2) {
549 for (i = 0; i < newbase->
n_counts[0]; ++i) {
551 bgcount = bgptr - model->
lm3g.bigrams;
554 E_DEBUG(2, (
"unigram %d: %s => bigram %d\n", i, newbase->
word_str[i], bgcount));
565 assert (bgptr - model->
lm3g.bigrams < newbase->
n_counts[1]);
567 bgptr->
wid = wids[1];
568 bgptr->
prob2 = sorted_id(&sorted_prob2, &prob2);
569 if (newbase->
n > 2) {
570 tgcount = (tgptr - model->
lm3g.trigrams);
571 bgcount = (bgptr - model->
lm3g.bigrams);
574 bgptr->
bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2);
578 seg = bgcount >> LOG_BG_SEG_SZ;
582 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
586 E_DEBUG(2, (
"bigram %d %s %s => trigram %d:%d\n",
597 assert(tgptr - model->
lm3g.trigrams < newbase->
n_counts[2]);
599 tgptr->
wid = wids[2];
600 tgptr->
prob3 = sorted_id(&sorted_prob3, &prob3);
601 E_DEBUG(2, (
"trigram %d %s %s %s => prob %d\n",
613 bgcount = bgptr - model->
lm3g.bigrams;
614 tgcount = tgptr - model->
lm3g.trigrams;
615 seg = bgcount >> LOG_BG_SEG_SZ;
616 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ)
624 model->
lm3g.
prob2 = vals_in_sorted_list(&sorted_prob2);
627 free_sorted_list(&sorted_prob2);
628 if (newbase->
n > 2) {
631 model->
lm3g.
bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2);
632 free_sorted_list(&sorted_bo_wt2);
636 model->
lm3g.
prob3 = vals_in_sorted_list(&sorted_prob3);
639 free_sorted_list(&sorted_prob3);
649fwrite_int32(FILE *fh, int32 val)
651 fwrite(&val, 4, 1, fh);
661 fwrite(&bogus, 4, 1, fh);
664 fwrite(&log10val, 4, 1, fh);
666 fwrite(&log10val, 4, 1, fh);
673 fwrite(bg,
sizeof(*bg), 1, fh);
679 fwrite(tg,
sizeof(*tg), 1, fh);
684static char const *fmtdesc[] = {
685 "BEGIN FILE FORMAT DESCRIPTION",
686 "Header string length (int32) and string (including trailing 0)",
687 "Original LM filename string-length (int32) and filename (including trailing 0)",
688 "(int32) version number (present iff value <= 0)",
689 "(int32) original LM file modification timestamp (iff version# present)",
690 "(int32) string-length and string (including trailing 0) (iff version# present)",
691 "... previous entry continued any number of times (iff version# present)",
692 "(int32) 0 (terminating sequence of strings) (iff version# present)",
693 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)",
694 "(int32) lm_t.ucount (must be > 0)",
695 "(int32) lm_t.bcount",
696 "(int32) lm_t.tcount",
697 "lm_t.ucount+1 unigrams (including sentinel)",
698 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3",
699 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)",
700 "(int32) lm_t.n_prob2",
701 "(int32) lm_t.prob2[]",
702 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)",
703 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)",
704 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)",
705 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)",
706 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)",
707 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)",
708 "(int32) Sum(all word string-lengths, including trailing 0 for each)",
709 "All word strings (including trailing 0 for each)",
710 "END FILE FORMAT DESCRIPTION",
715ngram_model_dmp_write_header(FILE * fh)
718 k = strlen(darpa_hdr) + 1;
720 fwrite(darpa_hdr, 1, k, fh);
724ngram_model_dmp_write_lm_filename(FILE * fh,
const char *lmfile)
728 k = strlen(lmfile) + 1;
730 fwrite(lmfile, 1, k, fh);
/* Version number written into the dump file header by
 * ngram_model_dmp_write_version().
 * NOTE(review): the name suggests this selects the 16-bit bigram/trigram
 * record layout -- confirm against the format description. */
733#define LMDMP_VERSION_TG_16BIT -1
738ngram_model_dmp_write_version(FILE * fh, int32 mtime)
740 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT);
741 fwrite_int32(fh, mtime);
745ngram_model_dmp_write_ngram_counts(FILE * fh,
ngram_model_t *model)
747 fwrite_int32(fh, model->
n_counts[0]);
748 fwrite_int32(fh, model->
n_counts[1]);
749 fwrite_int32(fh, model->
n_counts[2]);
753ngram_model_dmp_write_fmtdesc(FILE * fh)
759 for (i = 0; fmtdesc[i] != NULL; i++) {
760 k = strlen(fmtdesc[i]) + 1;
762 fwrite(fmtdesc[i], 1, k, fh);
768 fwrite_int32(fh, 4-k);
769 fwrite(
"!!!!", 1, 4-k, fh);
780 for (i = 0; i <= model->
n_counts[0]; i++) {
781 fwrite_ug(fh, &(lm->
lm3g.unigrams[i]), model->
lmath);
792 for (i = 0; i <= model->
n_counts[1]; i++) {
793 fwrite_bg(fh, &(lm->
lm3g.bigrams[i]));
804 for (i = 0; i < model->
n_counts[2]; i++) {
805 fwrite_tg(fh, &(lm->
lm3g.trigrams[i]));
818 fwrite(&log10val, 4, 1, fh);
831 fwrite(&log10val, 4, 1, fh);
844 fwrite(&log10val, 4, 1, fh);
849ngram_model_dmp_write_tg_segbase(FILE *fh,
ngram_model_t *model)
854 k = (model->
n_counts[1] + 1) / BG_SEG_SZ + 1;
856 for (i = 0; i < k; i++)
866 for (i = 0; i < model->
n_counts[0]; i++)
867 k += strlen(model->
word_str[i]) + 1;
869 for (i = 0; i < model->
n_counts[0]; i++)
871 strlen(model->
word_str[i]) + 1, fh);
876 const char *file_name)
883 model = ngram_model_dmp_build(base);
884 newbase = &model->
base;
888 if ((fh = fopen(file_name,
"wb")) == NULL) {
889 E_ERROR(
"Cannot create file %s\n", file_name);
892 ngram_model_dmp_write_header(fh);
893 ngram_model_dmp_write_lm_filename(fh, file_name);
894 ngram_model_dmp_write_version(fh, 0);
895 ngram_model_dmp_write_fmtdesc(fh);
896 ngram_model_dmp_write_ngram_counts(fh, newbase);
897 ngram_model_dmp_write_unigram(fh, newbase);
898 if (newbase->
n > 1) {
899 ngram_model_dmp_write_bigram(fh, newbase);
900 if (newbase->
n > 2) {
901 ngram_model_dmp_write_trigram(fh, newbase);
903 ngram_model_dmp_write_bgprob(fh, newbase);
904 if (newbase->
n > 2) {
905 ngram_model_dmp_write_tgbowt(fh, newbase);
906 ngram_model_dmp_write_tgprob(fh, newbase);
907 ngram_model_dmp_write_tg_segbase(fh, newbase);
910 ngram_model_dmp_write_wordstr(fh, newbase);
917ngram_model_dmp_apply_weights(
ngram_model_t *base, float32 lw,
918 float32 wip, float32 uw)
921 lm3g_apply_weights(base, &model->
lm3g, lw, wip, uw);
928#define NGRAM_MODEL_TYPE ngram_model_dmp_t
929#include "lm3g_templates.c"
953 lm3g_tginfo_free(base, &model->
lm3g);
957 ngram_model_dmp_free,
958 ngram_model_dmp_apply_weights,
960 lm3g_template_raw_score,
961 lm3g_template_add_ug,
964 lm3g_template_mgrams,
965 lm3g_template_successors,
966 lm3g_template_iter_get,
967 lm3g_template_iter_next,
968 lm3g_template_iter_free
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define ckd_salloc(ptr)
Macro for ckd_salloc
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Implementation of logging routines.
#define E_FATAL
Exit with non-zero status after error message.
#define E_ERROR
Print error message to standard error stream.
#define E_WARN
Print warning information to standard error stream.
#define E_INFO
Print logging information to standard error stream.
#define E_DEBUG(level, x)
Print debugging information to standard error stream.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Fast memory allocator for uniformly sized objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
SPHINXBASE_EXPORT logmath_t * logmath_retain(logmath_t *lmath)
Retain ownership of a log table.
SPHINXBASE_EXPORT void mmio_file_unmap(mmio_file_t *mf)
Unmap a file, releasing memory associated with it.
SPHINXBASE_EXPORT mmio_file_t * mmio_file_read(const char *filename)
Memory-map a file for reading.
SPHINXBASE_EXPORT void * mmio_file_ptr(mmio_file_t *mf)
Get a pointer to the memory mapped for a file.
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
file IO related operations.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint16 bo_wt2
Index into array of actual bigram backoff wts.
uint16 prob2
Index into array of actual bigram probs.
uint32 wid
Index of unigram entry for this.
Opaque structure used to hold the results of command-line parsing.
listelem_alloc_t * le
List element allocator for tginfo.
lmprob_t * prob2
Table of actual bigram probs.
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ)
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
tginfo_t ** tginfo
tginfo[lw2] is head of linked list of trigram information for some cached subset of bigrams (*,lw2).
int32 n_bo_wt2
bo_wt2 size
lmprob_t * prob3
Table of actual trigram probs.
Implementation-specific functions for operating on ngram_model_t objects.
Base iterator structure for N-grams.
Subclass of ngram_model for DMP file reading.
mmio_file_t * dump_mmap
mmap() of dump file (or NULL if none)
lm3g_model_t lm3g
Common lm3g_model_t structure.
ngram_model_t base
Base ngram_model_t structure.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
hash_table_t * wid
Mapping of unigram names to word IDs.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
struct ngram_funcs_s * funcs
Implementation-specific methods.
char ** word_str
Unigram names.
int32 free
first free element in list
Trigram information cache.
uint32 wid
Index of unigram entry for this.
uint16 prob3
Index into array of actual trigram probs.
Unigram structure (common among all lm3g implementations)
lmprob_t prob1
Unigram probability.
lmprob_t bo_wt1
Unigram backoff weight.
int32 bigrams
Index of 1st entry in lm_t.bigrams[].