SphinxBase 0.6
ngram_model.h
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2007 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
43#ifndef __NGRAM_MODEL_H__
44#define __NGRAM_MODEL_H__
45
46#include <stdarg.h>
47
48/* Win32/WinCE DLL gunk */
49#include <sphinxbase/sphinxbase_export.h>
51#include <sphinxbase/cmd_ln.h>
52#include <sphinxbase/logmath.h>
53#include <sphinxbase/mmio.h>
54
55#ifdef __cplusplus
56extern "C" {
57#endif
58#if 0
59/* Fool Emacs. */
60}
61#endif
62
67
72
83
84#define NGRAM_INVALID_WID -1
106SPHINXBASE_EXPORT
108 const char *file_name,
109 ngram_file_type_t file_type,
110 logmath_t *lmath);
111
117SPHINXBASE_EXPORT
118int ngram_model_write(ngram_model_t *model, const char *file_name,
119 ngram_file_type_t format);
120
126SPHINXBASE_EXPORT
127ngram_file_type_t ngram_file_name_to_type(const char *file_name);
128
134SPHINXBASE_EXPORT
135ngram_file_type_t ngram_str_to_type(const char *str_name);
136
143SPHINXBASE_EXPORT
144char const *ngram_type_to_str(int type);
145
151SPHINXBASE_EXPORT
153
159SPHINXBASE_EXPORT
161
178SPHINXBASE_EXPORT
179int ngram_model_recode(ngram_model_t *model, const char *from, const char *to);
180
184typedef enum ngram_case_e {
185 NGRAM_UPPER,
186 NGRAM_LOWER
188
195SPHINXBASE_EXPORT
196int ngram_model_casefold(ngram_model_t *model, int kase);
197
209SPHINXBASE_EXPORT
211 float32 lw, float32 wip, float32 uw);
212
221SPHINXBASE_EXPORT
222float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
223 int32 *out_log_uw);
224
257SPHINXBASE_EXPORT
258int32 ngram_score(ngram_model_t *model, const char *word, ...);
259
263SPHINXBASE_EXPORT
264int32 ngram_tg_score(ngram_model_t *model,
265 int32 w3, int32 w2, int32 w1,
266 int32 *n_used);
267
271SPHINXBASE_EXPORT
272int32 ngram_bg_score(ngram_model_t *model,
273 int32 w2, int32 w1,
274 int32 *n_used);
275
279SPHINXBASE_EXPORT
280int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
281 int32 n_hist, int32 *n_used);
282
293SPHINXBASE_EXPORT
294int32 ngram_prob(ngram_model_t *model, const char *word, ...);
295
302SPHINXBASE_EXPORT
303int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
304 int32 n_hist, int32 *n_used);
305
317SPHINXBASE_EXPORT
318int32 ngram_score_to_prob(ngram_model_t *model, int32 score);
319
323SPHINXBASE_EXPORT
324int32 ngram_wid(ngram_model_t *model, const char *word);
325
329SPHINXBASE_EXPORT
330const char *ngram_word(ngram_model_t *model, int32 wid);
331
345SPHINXBASE_EXPORT
346int32 ngram_unknown_wid(ngram_model_t *model);
347
351SPHINXBASE_EXPORT
352int32 ngram_zero(ngram_model_t *model);
353
357SPHINXBASE_EXPORT
359
363SPHINXBASE_EXPORT
364int32 const *ngram_model_get_counts(ngram_model_t *model);
365
370
379SPHINXBASE_EXPORT
381
385SPHINXBASE_EXPORT
386ngram_iter_t *ngram_iter(ngram_model_t *model, const char *word, ...);
387
391SPHINXBASE_EXPORT
392ngram_iter_t *ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist);
393
402SPHINXBASE_EXPORT
403int32 const *ngram_iter_get(ngram_iter_t *itor,
404 int32 *out_score,
405 int32 *out_bowt);
406
412SPHINXBASE_EXPORT
414
418SPHINXBASE_EXPORT
420
424SPHINXBASE_EXPORT
425void ngram_iter_free(ngram_iter_t *itor);
426
439SPHINXBASE_EXPORT
441 const char *word, float32 weight);
442
456SPHINXBASE_EXPORT
458 const char *file_name);
459
468SPHINXBASE_EXPORT
470 const char *classname,
471 float32 classweight,
472 char **words,
473 const float32 *weights,
474 int32 n_words);
475
485SPHINXBASE_EXPORT
487 const char *classname,
488 const char *word,
489 float32 weight);
490
515SPHINXBASE_EXPORT
517 ngram_model_t **models,
518 char **names,
519 const float32 *weights,
520 int32 n_models);
521
552SPHINXBASE_EXPORT
554 const char *lmctlfile,
555 logmath_t *lmath);
556
560SPHINXBASE_EXPORT
562
567
573SPHINXBASE_EXPORT
575
581SPHINXBASE_EXPORT
583
587SPHINXBASE_EXPORT
589
597SPHINXBASE_EXPORT
599 char const **lmname);
600
607SPHINXBASE_EXPORT
609 const char *name);
610
617SPHINXBASE_EXPORT
619 const char *name);
620
624SPHINXBASE_EXPORT
625const char *ngram_model_set_current(ngram_model_t *set);
626
634SPHINXBASE_EXPORT
636 const char **names,
637 const float32 *weights);
638
651SPHINXBASE_EXPORT
653 ngram_model_t *model,
654 const char *name,
655 float32 weight,
656 int reuse_widmap);
657
666SPHINXBASE_EXPORT
668 const char *name,
669 int reuse_widmap);
670
674SPHINXBASE_EXPORT
676 const char **words,
677 int32 n_words);
678
686SPHINXBASE_EXPORT
688 int32 set_wid);
689
699SPHINXBASE_EXPORT
700int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid);
701
709SPHINXBASE_EXPORT
711
712#ifdef __cplusplus
713}
714#endif
715
716
717#endif /* __NGRAM_MODEL_H__ */
Command-line and other configuration parsing and handling.
Fast integer logarithmic addition operations.
Memory-mapped I/O wrappers for files.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_lookup(ngram_model_t *set, const char *name)
Look up a language model by name from a set.
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_model_set_count(ngram_model_t *set)
Returns the number of language models in a set.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter(ngram_model_t *model, const char *word,...)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip, int32 *out_log_uw)
Get the current weights from a language model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
SPHINXBASE_EXPORT const char * ngram_model_set_current(ngram_model_t *set)
Get the current language model name, if any.
SPHINXBASE_EXPORT int32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
SPHINXBASE_EXPORT void ngram_model_set_map_words(ngram_model_t *set, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
ngram_file_type_e
File types for N-Gram files.
Definition ngram_model.h:76
@ NGRAM_INVALID
Not a valid file type.
Definition ngram_model.h:77
@ NGRAM_AUTO
Determine file type automatically.
Definition ngram_model.h:78
@ NGRAM_ARPA
ARPABO text format (the standard).
Definition ngram_model.h:79
@ NGRAM_DMP32
Sphinx .DMP32 format (NOT SUPPORTED)
Definition ngram_model.h:81
@ NGRAM_DMP
Sphinx .DMP format.
Definition ngram_model.h:80
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *set)
Begin iterating over language models in a set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip, float32 uw)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT int ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
Re-encode word strings in an N-Gram model.
ngram_case_e
Constants for case folding.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_add(ngram_model_t *set, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_interp(ngram_model_t *set, const char **names, const float32 *weights)
Set interpolation weights for a set and enable interpolation.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_select(ngram_model_t *set, const char *name)
Select a single language model from a set for scoring.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
Definition ngram_model.c:64
SPHINXBASE_EXPORT int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_remove(ngram_model_t *set, const char *name, int reuse_widmap)
Remove a language model from a set.
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
SPHINXBASE_EXPORT void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a language model set.
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
SPHINXBASE_EXPORT int32 ngram_model_set_current_wid(ngram_model_t *set, int32 set_wid)
Query the word-ID mapping for the current language model.
enum ngram_case_e ngram_case_t
Constants for case folding.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
Definition ngram_model.c:97
Basic type definitions used in Sphinx.
Opaque structure used to hold the results of command-line parsing.
Implementation of ngram_class_t.
Base iterator structure for N-grams.
int16 m
Order of history.
Common implementation of ngram_model_t.
Iterator over a model set.