SphinxBase 0.6
sphinx_lm_convert.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2009 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
41#include <sphinxbase/logmath.h>
43#include <sphinxbase/cmd_ln.h>
45#include <sphinxbase/err.h>
46#include <sphinxbase/pio.h>
47#include <sphinxbase/strfuncs.h>
48
49#include <stdio.h>
50#include <string.h>
51#include <math.h>
52
53static const arg_t defn[] = {
54 { "-help",
56 "no",
57 "Shows the usage of the tool"},
58
59 { "-logbase",
61 "1.0001",
62 "Base in which all log-likelihoods calculated" },
63
64 { "-i",
66 NULL,
67 "Input language model file (required)"},
68
69 { "-o",
71 NULL,
72 "Output language model file (required)"},
73
74 { "-ifmt",
76 NULL,
77 "Input language model format (will guess if not specified)"},
78
79 { "-ofmt",
81 NULL,
82 "Output language model file (will guess if not specified)"},
83
84 { "-ienc",
86 NULL,
87 "Input language model text encoding (no conversion done if not specified)"},
88
89 { "-oenc",
91 "utf8",
92 "Output language model text encoding"},
93
94 { "-case",
96 NULL,
97 "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
98
99 { "-mmap",
101 "no",
102 "Use memory-mapped I/O for reading binary LM files"},
103
104 { "-debug",
105 ARG_INT32,
106 NULL,
107 "Verbosity level for debugging messages"
108 },
109
110 { NULL, 0, NULL, NULL }
111};
112
/**
 * Print a brief usage summary through the logging macros, then
 * terminate the process with exit status 0.  This function does not
 * return.
 *
 * @param pgm Program name to show in the usage line.
 */
static void
usagemsg(char *pgm)
{
    /* Continuation lines printed after the "Usage:" header. */
    static const char *const continuation[] = {
        "\t[-ifmt txt] [-ofmt dmp]\n",
        "\t-o <output.lm.DMP>\n",
    };
    size_t idx;

    E_INFO("Usage: %s -i <input.lm> \\\n", pgm);
    for (idx = 0; idx < sizeof(continuation) / sizeof(continuation[0]); ++idx)
        E_INFOCONT("%s", continuation[idx]);

    exit(0);
}
122
123
124int
125main(int argc, char *argv[])
126{
127 cmd_ln_t *config;
128 ngram_model_t *lm = NULL;
129 logmath_t *lmath;
130 int itype, otype;
131 char const *kase;
132
133 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
134 return 1;
135
136 if (cmd_ln_boolean_r(config, "-help")) {
137 usagemsg(argv[0]);
138 }
139
140 err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
141
142 /* Create log math object. */
143 if ((lmath = logmath_init
144 (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
145 E_FATAL("Failed to initialize log math\n");
146 }
147
148 if (cmd_ln_str_r(config, "-i") == NULL || cmd_ln_str_r(config, "-i") == NULL) {
149 E_ERROR("Please specify both input and output models\n");
150 goto error_out;
151 }
152
153
154 /* Load the input language model. */
155 if (cmd_ln_str_r(config, "-ifmt")) {
156 if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
157 == NGRAM_INVALID) {
158 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
159 goto error_out;
160 }
161 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
162 itype, lmath);
163 }
164 else {
165 lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
166 NGRAM_AUTO, lmath);
167 }
168
169 if (lm == NULL) {
170 E_FATAL("Failed to read the model from the file '%s'", cmd_ln_str_r(config, "-i"));
171 }
172
173 /* Guess or set the output language model type. */
174 if (cmd_ln_str_r(config, "-ofmt")) {
175 if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
176 == NGRAM_INVALID) {
177 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
178 goto error_out;
179 }
180 }
181 else {
182 otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
183 }
184
185 /* Recode the language model if desired. */
186 if (cmd_ln_str_r(config, "-ienc")) {
187 if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"),
188 cmd_ln_str_r(config, "-oenc")) != 0) {
189 E_ERROR("Failed to recode language model from %s to %s\n",
190 cmd_ln_str_r(config, "-ienc"),
191 cmd_ln_str_r(config, "-oenc"));
192 goto error_out;
193 }
194 }
195
196 /* Case fold if requested. */
197 if ((kase = cmd_ln_str_r(config, "-case"))) {
198 if (0 == strcmp(kase, "lower")) {
199 ngram_model_casefold(lm, NGRAM_LOWER);
200 }
201 else if (0 == strcmp(kase, "upper")) {
202 ngram_model_casefold(lm, NGRAM_UPPER);
203 }
204 else {
205 E_ERROR("Unknown value for -case: %s\n", kase);
206 goto error_out;
207 }
208 }
209
210 /* Write the output language model. */
211 if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
212 E_ERROR("Failed to write language model in format %s to %s\n",
213 ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
214 goto error_out;
215 }
216
217 /* That's all folks! */
219 return 0;
220
221error_out:
223 return 1;
224}
Sphinx's memory allocation/deallocation routines.
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition cmd_ln.h:334
#define ARG_STRING
String argument (optional).
Definition cmd_ln.h:114
#define ARG_INT32
Definition cmd_ln.h:144
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition cmd_ln.c:949
#define REQARG_STRING
Required string argument.
Definition cmd_ln.h:135
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition cmd_ln.h:118
#define ARG_FLOAT64
Definition cmd_ln.h:152
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition cmd_ln.c:551
Implementation of logging routines.
#define E_FATAL
Exit with non-zero status after error message.
Definition err.h:127
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
SPHINXBASE_EXPORT int err_set_debug_level(int level)
Set debugging verbosity level.
Definition err.c:68
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
#define E_INFOCONT
Print logging information without header, to standard error stream.
Definition err.h:153
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT logmath_t * logmath_init(float64 base, int shift, int use_table)
Initialize a log math computation table.
Definition logmath.c:62
N-Gram language models.
@ NGRAM_INVALID
Not a valid file type.
Definition ngram_model.h:77
@ NGRAM_AUTO
Determine file type automatically.
Definition ngram_model.h:78
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT int ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
Re-encode word strings in an N-Gram model.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
Definition ngram_model.c:64
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
Definition ngram_model.c:97
File I/O related operations.
Miscellaneous useful string functions.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Common implementation of ngram_model_t.