SphinxBase 0.6
ngram_model_internal.h
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * \file ngram_model_internal.h Internal structures for N-Gram models
39 *
40 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
41 */
42
43#ifndef __NGRAM_MODEL_INTERNAL_H__
44#define __NGRAM_MODEL_INTERNAL_H__
45
48
79
84 int32 tag_wid;
85 int32 start_wid;
86 int32 n_words;
87 int32 *prob1;
91 struct ngram_hash_s {
92 int32 wid;
93 int32 prob1;
94 int32 next;
95 } *nword_hash;
96 int32 n_hash;
98};
99
/**
 * Hash size constant.
 *
 * NOTE(review): presumably the bucket count used when sizing the
 * nword_hash table of a class -- confirm at the use site.
 */
#define NGRAM_HASH_SIZE 128

/** Extract the base word ID (the low 24 bits) from a word ID. */
#define NGRAM_BASEWID(wid) ((wid)&0xffffff)
/** Extract the 7-bit class ID stored in bits 24-30 of a word ID. */
#define NGRAM_CLASSID(wid) (((wid)>>24) & 0x7f)
/**
 * Build a class word ID: classid goes into bits 24 and up, bit 31 is
 * set as the class-word flag, and wid is OR'd in unmasked (callers must
 * pass a wid that fits in the low 24 bits).
 */
#define NGRAM_CLASSWID(wid,classid) (((classid)<<24) | 0x80000000 | (wid))
/** Nonzero when bit 31 (the class-word flag) is set in wid. */
#define NGRAM_IS_CLASSWID(wid) ((wid)&0x80000000)

/**
 * Allocation step constant.
 *
 * NOTE(review): presumably the number of unigram slots added each time
 * the unigram storage grows -- confirm at the use site.
 */
#define UG_ALLOC_STEP 10
108
110typedef struct ngram_funcs_s {
114 void (*free)(ngram_model_t *model);
119 float32 lw,
120 float32 wip,
121 float32 uw);
125 int32 (*score)(ngram_model_t *model,
126 int32 wid,
127 int32 *history,
128 int32 n_hist,
129 int32 *n_used);
134 int32 (*raw_score)(ngram_model_t *model,
135 int32 wid,
136 int32 *history,
137 int32 n_hist,
138 int32 *n_used);
150 int32 (*add_ug)(ngram_model_t *model,
151 int32 wid, int32 lweight);
155 void (*flush)(ngram_model_t *model);
156
160 ngram_iter_t * (*iter)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist);
161
165 ngram_iter_t * (*mgrams)(ngram_model_t *model, int32 m);
166
170 ngram_iter_t * (*successors)(ngram_iter_t *itor);
171
175 int32 const * (*iter_get)(ngram_iter_t *itor,
176 int32 *out_score,
177 int32 *out_bowt);
178
182 ngram_iter_t * (*iter_next)(ngram_iter_t *itor);
183
187 void (*iter_free)(ngram_iter_t *itor);
189
194 ngram_model_t *model;
195 int32 *wids;
196 int16 m;
197 int16 successor;
198};
199
203typedef struct classdef_s {
204 char **words;
205 float32 *weights;
206 int32 n_words;
207} classdef_t;
208
212int32
213ngram_model_init(ngram_model_t *model,
214 ngram_funcs_t *funcs,
215 logmath_t *lmath,
216 int32 n, int32 n_unigram);
217
221ngram_model_t *ngram_model_arpa_read(cmd_ln_t *config,
222 const char *file_name,
223 logmath_t *lmath);
227ngram_model_t *ngram_model_dmp_read(cmd_ln_t *config,
228 const char *file_name,
229 logmath_t *lmath);
233ngram_model_t *ngram_model_dmp32_read(cmd_ln_t *config,
234 const char *file_name,
235 logmath_t *lmath);
236
240int ngram_model_arpa_write(ngram_model_t *model,
241 const char *file_name);
245int ngram_model_dmp_write(ngram_model_t *model,
246 const char *file_name);
247
251int32 read_classdef_file(hash_table_t *classes, const char *classdef_file);
252
256void classdef_free(classdef_t *classdef);
257
261ngram_class_t *ngram_class_new(ngram_model_t *model, int32 tag_wid,
262 int32 start_wid, glist_t classwords);
263
267void ngram_class_free(ngram_class_t *lmclass);
268
274int32 ngram_class_prob(ngram_class_t *lmclass, int32 wid);
275
279void ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
280 int m, int successor);
281
282#endif /* __NGRAM_MODEL_INTERNAL_H__ */
Hash table implementation.
N-Gram language models.
One class definition from a classdef file.
Opaque structure used to hold the results of command-line parsing.
A node in a generic list.
Definition glist.h:100
Custom hash table for additional words.
int32 prob1
Probability for this word.
int32 next
Index of next bucket (or -1 for no collision)
int32 wid
Word ID of this bucket.
Implementation of ngram_class_t.
int32 start_wid
Starting base word ID for this class' words.
int32 * prob1
Probability table for base words.
int32 n_hash_inuse
Number of words in nword_hash.
int32 n_hash
Number of buckets in nword_hash (power of 2)
int32 tag_wid
Base word ID for this class tag.
int32 n_words
Number of base words for this class.
Implementation-specific functions for operating on ngram_model_t objects.
int(* apply_weights)(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Implementation-specific function for applying language model weights.
int32(* raw_score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying raw language model probability.
int32(* score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying language model score.
void(* iter_free)(ngram_iter_t *itor)
Implementation-specific function for freeing an N-Gram iterator.
void(* free)(ngram_model_t *model)
Implementation-specific function for freeing an ngram_model_t.
int32(* add_ug)(ngram_model_t *model, int32 wid, int32 lweight)
Implementation-specific function for adding unigrams.
void(* flush)(ngram_model_t *model)
Implementation-specific function for purging N-Gram cache.
Base iterator structure for N-grams.
int32 * wids
Scratch space for word IDs.
int16 successor
Is this a successor iterator?
int16 m
Order of history.
Common implementation of ngram_model_t.
logmath_t * lmath
Log-math object.
struct ngram_class_s ** classes
Word class definitions.
int refcount
Reference count.
uint8 n
This is an n-gram model (1, 2, 3, ...).
int32 log_wip
Log of word insertion penalty.
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
int32 * tmp_wids
Temporary array of word IDs for ngram_model_get_ngram()
int32 log_uniform
Log of uniform (0-gram) probability.
int32 log_zero
Zero probability, cached here for quick lookup.
int32 log_uw
Log of unigram weight.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words).
hash_table_t * wid
Mapping of unigram names to word IDs.
float32 lw
Language model scaling factor.
uint8 writable
Are word strings writable?
int32 * n_counts
Counts for 1, 2, 3, ... grams.
int32 log_uniform_weight
Log of uniform weight (i.e. 1 - unigram weight).
uint8 flags
Any other flags we might care about (FIXME: Merge this and writable)
struct ngram_funcs_s * funcs
Implementation-specific methods.
uint8 n_classes
Number of classes (maximum 128)
char ** word_str
Unigram names.