53#include "ngram_model_arpa.h"
57#define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
58#define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams)
59#define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))
65ReadNgramCounts(
lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
67 int32 ngram, ngram_cnt;
72 if (strcmp((*li)->buf,
"\\data\\") == 0)
76 if (*li == NULL || strcmp((*li)->buf,
"\\data\\") != 0) {
77 E_INFO(
"No \\data\\ mark in LM file\n");
81 *n_ug = *n_bg = *n_tg = 0;
83 if (sscanf((*li)->buf,
"ngram %d=%d", &ngram, &ngram_cnt) != 2)
96 E_ERROR(
"Unknown ngram (%d)\n", ngram);
101 E_ERROR(
"EOF while reading ngram counts\n");
108 if (strcmp((*li)->buf,
"\\1-grams:") == 0)
116 if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
117 E_ERROR(
"Bad or missing ngram count\n");
135 E_INFO(
"Reading unigrams\n");
139 char *wptr[3], *name;
140 float32 bo_wt = 0.0f;
144 if (strcmp((*li)->buf,
"\\2-grams:") == 0
145 || strcmp((*li)->buf,
"\\end\\") == 0)
148 if ((n =
str2words((*li)->buf, wptr, 3)) < 2) {
149 if ((*li)->buf[0] !=
'\0')
150 E_WARN(
"Format error; unigram ignored: %s\n", (*li)->buf);
154 p1 = (float)
atof_c(wptr[0]);
157 bo_wt = (float)
atof_c(wptr[2]);
161 E_ERROR(
"Too many unigrams\n");
168 != (
void *)(
long)wcnt) {
169 E_WARN(
"Duplicate word in dictionary: %s\n", base->
word_str[wcnt]);
177 E_WARN(
"lm_t.ucount(%d) != #unigrams read(%d)\n",
192 int32 w1, w2, prev_w1, bgcount;
195 E_INFO(
"Reading bigrams\n");
198 bgptr = model->
lm3g.bigrams;
202 float32 p, bo_wt = 0.0f;
204 char *wptr[4], *word1, *word2;
209 if ((n =
str2words((*li)->buf, wptr, 4)) < 3) {
210 if ((*li)->buf[0] !=
'\0')
215 p = (float32)
atof_c(wptr[0]);
219 bo_wt = (float32)
atof_c(wptr[3]);
223 E_ERROR(
"Unknown word: %s, skipping bigram (%s %s)\n",
224 word1, word1, word2);
228 E_ERROR(
"Unknown word: %s, skipping bigram (%s %s)\n",
229 word2, word1, word2);
235 p = (float32)((int32)(p * 10000)) / 10000;
236 bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;
247 bgptr->
prob2 = sorted_id(&model->sorted_prob2, &p2);
249 bgptr->
bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);
253 E_ERROR(
"Bigrams not in unigram order\n");
257 for (prev_w1++; prev_w1 <= w1; prev_w1++)
264 if ((bgcount & 0x0000ffff) == 0) {
268 if (*li == NULL || ((strcmp((*li)->buf,
"\\end\\") != 0)
269 && (strcmp((*li)->buf,
"\\3-grams:") != 0))) {
270 E_ERROR(
"Bad bigram: %s\n", (*li)->buf);
274 for (prev_w1++; prev_w1 <= base->
n_counts[0]; prev_w1++)
287 int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
288 int32 seg, prev_seg, prev_seg_lastbg;
292 E_INFO(
"Reading trigrams\n");
295 tgptr = model->
lm3g.trigrams;
304 char *wptr[4], *word1, *word2, *word3;
307 if (
str2words((*li)->buf, wptr, 4) != 4) {
308 if ((*li)->buf[0] !=
'\0')
313 p = (float32)
atof_c(wptr[0]);
320 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
321 word1, word1, word2, word3);
325 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
326 word2, word1, word2, word3);
330 E_ERROR(
"Unknown word: %s, skipping trigram (%s %s %s)\n",
331 word3, word1, word2, word3);
337 p = (float32)((int32)(p * 10000)) / 10000;
341 E_ERROR(
"Too many trigrams\n");
346 tgptr->
prob3 = sorted_id(&model->sorted_prob3, &p3);
348 if ((w1 != prev_w1) || (w2 != prev_w2)) {
350 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
351 E_ERROR(
"Trigrams not in bigram order\n");
356 prev_w1) ? model->
lm3g.unigrams[w1].
bigrams : prev_bg + 1;
358 bgptr = model->
lm3g.bigrams + bg;
359 for (; (bg < endbg) && (bgptr->
wid != w2); bg++, bgptr++);
361 E_ERROR(
"Missing bigram for trigram: %s", (*li)->buf);
366 seg = bg >> LOG_BG_SEG_SZ;
367 for (i = prev_seg + 1; i <= seg; i++)
371 if (prev_seg < seg) {
377 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
382 prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
383 bgptr = model->
lm3g.bigrams + prev_bg;
384 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
388 for (; prev_bg <= bg; prev_bg++, bgptr++)
396 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
400 bgptr = model->
lm3g.bigrams + prev_bg;
401 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
414 if ((tgcount & 0x0000ffff) == 0) {
418 if (*li == NULL || strcmp((*li)->buf,
"\\end\\") != 0) {
419 E_ERROR(
"Bad trigram: %s\n", (*li)->buf);
423 for (prev_bg++; prev_bg <= base->
n_counts[1]; prev_bg++) {
424 if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
426 if ((tgcount - model->
lm3g.
tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
427 E_ERROR(
"Size of trigram segment is bigger than 65535, such a big language models are not supported, use smaller vocabulary\n");
437new_unigram_table(int32 n_ug)
443 for (i = 0; i < n_ug; i++) {
444 table[i].
prob1.l = INT_MIN;
445 table[i].
bo_wt1.l = INT_MIN;
451ngram_model_arpa_read(
cmd_ln_t *config,
452 const char *file_name,
465 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
466 E_ERROR(
"File %s not found\n", file_name);
472 if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
477 E_INFO(
"ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
484 else if (n_bigram > 0)
489 ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
499 model->
lm3g.unigrams = new_unigram_table(n_unigram + 1);
500 model->
lm3g.bigrams =
503 model->
lm3g.trigrams =
511 if (ReadUnigrams(&li, model) == -1) {
518 init_sorted_list(&model->sorted_prob2);
520 init_sorted_list(&model->sorted_bo_wt2);
523 if (ReadBigrams(&li, model) == -1) {
531 model->
lm3g.
prob2 = vals_in_sorted_list(&model->sorted_prob2);
532 free_sorted_list(&model->sorted_prob2);
540 model->
lm3g.
bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
541 free_sorted_list(&model->sorted_bo_wt2);
544 init_sorted_list(&model->sorted_prob3);
546 if (ReadTrigrams(&li, model) == -1) {
554 model->
lm3g.
prob3 = vals_in_sorted_list(&model->sorted_prob3);
558 free_sorted_list(&model->sorted_prob3);
572 const char *file_name)
578 if ((fh = fopen(file_name,
"w")) == NULL) {
582 fprintf(fh,
"This is an ARPA-format language model file, generated by CMU Sphinx\n");
589 fprintf(fh,
"\\data\\\n");
590 for (i = 0; i < model->
n; ++i) {
591 fprintf(fh,
"ngram %d=%d\n", i+1, model->
n_counts[i]);
595 for (i = 0; i < model->
n; ++i) {
596 fprintf(fh,
"\n\\%d-grams:\n", i + 1);
604 for (j = 0; j <= i; ++j) {
605 assert(wids[j] < model->
n_counts[0]);
606 fprintf(fh,
"%s ", model->
word_str[wids[j]]);
613 fprintf(fh,
"\n\\end\\\n");
618ngram_model_arpa_apply_weights(
ngram_model_t *base, float32 lw,
619 float32 wip, float32 uw)
622 lm3g_apply_weights(base, &model->
lm3g, lw, wip, uw);
629#define NGRAM_MODEL_TYPE ngram_model_arpa_t
630#include "lm3g_templates.c"
642 lm3g_tginfo_free(base, &model->
lm3g);
647 ngram_model_arpa_free,
648 ngram_model_arpa_apply_weights,
650 lm3g_template_raw_score,
651 lm3g_template_add_ug,
654 lm3g_template_mgrams,
655 lm3g_template_successors,
656 lm3g_template_iter_get,
657 lm3g_template_iter_next,
658 lm3g_template_iter_free
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define ckd_salloc(ptr)
Macro for ckd_salloc
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text, then call perror("").
#define E_ERROR
Print error message to standard error stream.
#define E_WARN
Print warning information to standard error stream.
#define E_INFO
Print logging information to standard error stream.
#define E_INFOCONT
Print logging information without header, to standard error stream.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Fast memory allocator for uniformly sized objects.
SPHINXBASE_EXPORT listelem_alloc_t * listelem_alloc_init(size_t elemsize)
Initialize and return a list element allocator.
SPHINXBASE_EXPORT float64 logmath_log_to_log10(logmath_t *lmath, int logb_p)
Convert integer log in base B to base 10 log (in floating point).
SPHINXBASE_EXPORT int logmath_log10_to_log(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to integer log in base B.
#define NGRAM_INVALID_WID
Impossible word ID.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
file IO related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but uses popen and zcat if "file" is determined to be compressed.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
@ STRING_BOTH
Both ends of string.
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
uint16 trigrams
Index of 1st entry in lm_t.trigrams[], RELATIVE TO its segment base (see above)
uint16 bo_wt2
Index into array of actual bigram backoff wts.
uint16 prob2
Index into array of actual bigram probs.
uint32 wid
Index of unigram entry for this.
Opaque structure used to hold the results of command-line parsing.
listelem_alloc_t * le
List element allocator for tginfo.
lmprob_t * prob2
Table of actual bigram probs.
int32 * tseg_base
tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st trigram for bigram segment (i>>LOG_BG_SEG_SZ)
lmprob_t * bo_wt2
Table of actual bigram backoff weights.
tginfo_t ** tginfo
tginfo[lw2] is the head of a linked list of trigram information for some cached subset of bigrams (*, lw2).
int32 n_bo_wt2
bo_wt2 size
lmprob_t * prob3
Table of actual trigram probs.
Implementation-specific functions for operating on ngram_model_t objects.
Base iterator structure for N-grams.
Subclass of ngram_model for ARPA file reading.
ngram_model_t base
Base ngram_model_t structure.
lm3g_model_t lm3g
Shared lm3g structure.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
hash_table_t * wid
Mapping of unigram names to word IDs.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
char ** word_str
Unigram names.
int32 free
first free element in list
Trigram information cache.
uint32 wid
Index of unigram entry for this.
uint16 prob3
Index into array of actual trigram probs.
Unigram structure (common among all lm3g implementations)
lmprob_t prob1
Unigram probability.
lmprob_t bo_wt1
Unigram backoff weight.
int32 bigrams
Index of 1st entry in lm_t.bigrams[].