SphinxBase 0.6
sphinx_fe.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37#include <stdio.h>
38#include <stdlib.h>
39#include <string.h>
40#include <time.h>
41#include <assert.h>
42
43#ifdef HAVE_CONFIG_H
44#include <config.h>
45#endif
46
47#ifdef HAVE_SNDFILE_H
48#include <sndfile.h>
49#endif
50
51#include <sphinxbase/fe.h>
52#include <sphinxbase/strfuncs.h>
53#include <sphinxbase/pio.h>
54#include <sphinxbase/filename.h>
55#include <sphinxbase/cmd_ln.h>
56#include <sphinxbase/err.h>
58#include <sphinxbase/byteorder.h>
60
61#include "sphinx_wave2feat.h"
62#include "cmd_ln_defn.h"
63
64typedef struct audio_type_s {
65 char const *name;
66 int (*detect)(sphinx_wave2feat_t *wtf);
67 int (*decode)(sphinx_wave2feat_t *wtf);
69
70typedef struct output_type_s {
71 char const *name;
72 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
73 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
75
80 char *infile;
81 char *outfile;
82 FILE *infh;
83 FILE *outfh;
84 short *audio;
85 mfcc_t **feat;
88 int veclen;
91#ifdef HAVE_SNDFILE_H
92 SNDFILE *insfh;
93#endif
95};
96
/**
 * Layout of the fixed RIFF/WAVE header that detect_riff() reads in a
 * single fread() from the start of the file.
 */
typedef struct RIFFHeader {
    char  rifftag[4];        /* the string "RIFF" */
    int32 TotalLength;       /* total length */
    char  wavefmttag[8];     /* the string "WAVEfmt " (trailing space included) */
    int32 RemainingLength;   /* remaining length */
    int16 data_format;       /* data format tag; 1 means PCM */
    int16 numchannels;       /* number of channels in the file */
    int32 SamplingFreq;      /* sampling frequency */
    int32 BytesPerSec;       /* average bytes per second */
    int16 BlockAlign;        /* block alignment */
    int16 BitsPerSample;     /* 8 or 16 bits */
    char  datatag[4];        /* the string "data" */
    int32 datalength;        /* length of the raw data */
} MSWAV_hdr;
112
118static int
119detect_riff(sphinx_wave2feat_t *wtf)
120{
121 FILE *fh;
122 MSWAV_hdr hdr;
123
124 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
125 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
126 return -1;
127 }
128 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
129 E_ERROR_SYSTEM("Failed to read RIFF header");
130 fclose(fh);
131 return -1;
132 }
133 /* Make sure it is actually a RIFF file. */
134 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
135 fclose(fh);
136 return FALSE;
137 }
138
139 /* Get relevant information. */
140 cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
141 cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
142 wtf->infh = fh;
143
144 return TRUE;
145}
146
147static int
148open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
149{
150 char nist[7];
151 lineiter_t *li;
152 FILE *fh;
153
154 if ((fh = fopen(infile, "rb")) == NULL) {
155 E_ERROR_SYSTEM("Failed to open %s", infile);
156 return -1;
157 }
158 if (fread(&nist, 1, 7, fh) != 7) {
159 E_ERROR_SYSTEM("Failed to read NIST header");
160 fclose(fh);
161 return -1;
162 }
163 /* Is this actually a NIST file? */
164 if (0 != strncmp(nist, "NIST_1A", 7)) {
165 fclose(fh);
166 return FALSE;
167 }
168 /* Rewind, parse lines. */
169 fseek(fh, 0, SEEK_SET);
170 for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
171 char **words;
172 int nword;
173
174 string_trim(li->buf, STRING_BOTH);
175 if (strlen(li->buf) == 0) {
176 lineiter_free(li);
177 break;
178 }
179 nword = str2words(li->buf, NULL, 0);
180 if (nword != 3)
181 continue;
182 words = ckd_calloc(nword, sizeof(*words));
183 str2words(li->buf, words, nword);
184 if (0 == strcmp(words[0], "sample_rate")) {
185 cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
186 }
187 if (0 == strcmp(words[0], "channel_count")) {
188 cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
189 }
190 if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
191 cmd_ln_set_str_r(wtf->config, "-input_endian",
192 (0 == strcmp(words[2], "10")) ? "big" : "little");
193 }
194 ckd_free(words);
195 }
196
197 fseek(fh, 1024, SEEK_SET);
198 if (out_fh)
199 *out_fh = fh;
200 else
201 fclose(fh);
202 return TRUE;
203}
204
205#ifdef HAVE_POPEN
206static int
207detect_sph2pipe(sphinx_wave2feat_t *wtf)
208{
209 FILE *fh;
210 char *cmdline;
211 int rv;
212
213 /* Determine if it's NIST file and get parameters. */
214 if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
215 return rv;
216
217 /* Now popen it with sph2pipe. */
218 cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
219 if ((fh = popen(cmdline, "r")) == NULL) {
220 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
221 ckd_free(cmdline);
222 return -1;
223 }
224
225 wtf->infh = fh;
226 return TRUE;
227}
228#else /* !HAVE_POPEN */
229static int
230detect_sph2pipe(sphinx_wave2feat_t *wtf)
231{
232 E_ERROR("popen() not available, cannot run sph2pipe\n");
233 return -1;
234}
235#endif /* !HAVE_POPEN */
236
242static int
243detect_nist(sphinx_wave2feat_t *wtf)
244{
245 FILE *fh;
246 int rv;
247
248 if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
249 return rv;
250 wtf->infh = fh;
251
252 return TRUE;
253}
254
255
262static int
263detect_raw(sphinx_wave2feat_t *wtf)
264{
265 FILE *fh;
266
267 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
268 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
269 return -1;
270 }
271 wtf->infh = fh;
272 return TRUE;
273}
274
281static int
282detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
283{
284 FILE *fh;
285 int32 len;
286 long flen;
287
288 if ((fh = fopen(wtf->infile, "rb")) == NULL) {
289 E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
290 return -1;
291 }
292 if (fread(&len, 4, 1, fh) != 1) {
293 E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
294 fclose(fh);
295 return -1;
296 }
297 fseek(fh, 0, SEEK_END);
298 flen = ftell(fh);
299
300 /* figure out whether to byteswap */
301 flen = (flen / 4) - 1;
302 if (flen != len) {
303 /* First make sure this is an endianness problem, otherwise fail. */
304 SWAP_INT32(&len);
305 if (flen != len) {
306 SWAP_INT32(&len);
307 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
308 len, flen);
309 return -1;
310 }
311 /* Set the input endianness to the opposite of the machine endianness... */
312 cmd_ln_set_str_r(wtf->config, "-input_endian",
313 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
314 ? "little" : "big"));
315 }
316
317 fseek(fh, 4, SEEK_SET);
318 wtf->infh = fh;
319 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
320 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
321 }
322 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
323 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
324 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
325 }
326 else {
327 /* Should not happen. */
328 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
329 assert(FALSE);
330 }
331
332 return TRUE;
333}
334
/**
 * Reduce interleaved multi-channel audio to one channel, in place.
 *
 * @param buf       interleaved samples; the result overwrites its start.
 * @param nsamp     number of samples in buf (all channels together).
 * @param nchans    number of interleaved channels.
 * @param whichchan 1-based channel to keep; zero or less averages the
 *                  channels of every frame instead.
 * @return the final loop position divided by nchans, which callers use
 *         as the new sample count.
 */
int
mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
{
    int pos;

    if (whichchan <= 0) {
        /* Average each frame; a short final frame is still divided by nchans. */
        for (pos = 0; pos < nsamp; pos += nchans) {
            float64 sum = 0.0;
            int ch = 0;

            while (ch < nchans && pos + ch < nsamp) {
                sum += buf[pos + ch];
                ++ch;
            }
            buf[pos / nchans] = (int16)(sum / nchans);
        }
    }
    else {
        /* Keep one channel: compact its samples to the front of buf. */
        for (pos = whichchan - 1; pos < nsamp; pos += nchans)
            buf[pos / nchans] = buf[pos];
    }
    return pos / nchans;
}
355
#ifdef HAVE_SNDFILE_H
/**
 * Detect any format libsndfile can open and take the channel count and
 * sampling rate from it.
 *
 * @return TRUE with wtf->insfh set (and wtf->infh cleared), or FALSE
 *         if libsndfile cannot open the file.
 */
static int
detect_sndfile(sphinx_wave2feat_t *wtf)
{
    SNDFILE *sf;
    SF_INFO sfinfo;

    memset(&sfinfo, 0, sizeof(sfinfo));
    /* We let other detectors catch I/O errors, since there is
       no way to tell them from format errors when opening :( */
    if ((sf = sf_open(wtf->infile, SFM_READ, &sfinfo)) == NULL) {
        return FALSE;
    }
    /* Get relevant information. */
    cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
    cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
    wtf->insfh = sf;
    wtf->infh = NULL;

    return TRUE;
}

/**
 * Convert audio read through libsndfile into feature frames.
 *
 * Samples are read in blocks of wtf->blocksize, reduced to one channel
 * when there are several, run through the front end, and handed to the
 * output routine of the selected format.  The sndfile handle is closed
 * before returning, on success and on failure alike.
 *
 * @return total count reported by the output routine, or -1 if it fails.
 */
static int
decode_sndfile(sphinx_wave2feat_t *wtf)
{
    size_t nsamp;
    int32 nfr, nchans, whichchan;
    int nfloat, n;

    nchans = cmd_ln_int32_r(wtf->config, "-nchans");
    whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
    fe_start_utt(wtf->fe);
    nfloat = 0;
    while ((nsamp = sf_read_short(wtf->insfh,
                                  wtf->audio,
                                  wtf->blocksize)) != 0) {
        int16 const *inspeech;
        size_t nvec;

        /* Mix or pick channels. */
        if (nchans > 1)
            nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);

        inspeech = wtf->audio;
        nvec = wtf->featsize;
        /* Consume all samples. */
        while (nsamp) {
            nfr = nvec;
            fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
            if (nfr) {
                if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
                    goto error_out;
                nfloat += n;
            }
        }
    }
    /* Now process any leftover audio frames. */
    fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
    if (nfr) {
        if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
            goto error_out;
        nfloat += n;
    }

    sf_close(wtf->insfh);
    wtf->insfh = NULL;
    return nfloat;

error_out:
    /* Nothing else in this file closes the handle, so do it here. */
    sf_close(wtf->insfh);
    wtf->insfh = NULL;
    return -1;
}
#endif /* HAVE_SNDFILE_H */
435
440static int
441decode_pcm(sphinx_wave2feat_t *wtf)
442{
443 size_t nsamp;
444 int32 nfr, nchans, whichchan;
445 int nfloat, n;
446
447 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
448 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
449 fe_start_utt(wtf->fe);
450 nfloat = 0;
451 while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
452 size_t nvec;
453 int16 const *inspeech;
454
455 /* Byteswap stuff here if necessary. */
456 if (wtf->byteswap) {
457 for (n = 0; n < nsamp; ++n)
458 SWAP_INT16(wtf->audio + n);
459 }
460
461 /* Mix or pick channels. */
462 if (nchans > 1)
463 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
464
465 inspeech = wtf->audio;
466 nvec = wtf->featsize;
467 /* Consume all samples. */
468 while (nsamp) {
469 nfr = nvec;
470 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
471 if (nfr) {
472 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
473 return -1;
474 nfloat += n;
475 }
476 }
477 inspeech = wtf->audio;
478 }
479 /* Now process any leftover audio frames. */
480 fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
481 if (nfr) {
482 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
483 return -1;
484 nfloat += n;
485 }
486
487 if (fclose(wtf->infh) == EOF)
488 E_ERROR_SYSTEM("Failed to close input file");
489 wtf->infh = NULL;
490 return nfloat;
491}
492
497static int
498decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
499{
500 int nfloat = 0, n;
501 int featsize = wtf->featsize;
502
503 /* If the input vector length is less than the output length, we
504 * need to do this one frame at a time, because there's empty
505 * space at the end of each vector in wtf->feat. */
506 if (wtf->in_veclen < wtf->veclen)
507 featsize = 1;
508 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
509 featsize * wtf->in_veclen, wtf->infh)) != 0) {
510 int i, nfr = n / wtf->in_veclen;
511 if (n % wtf->in_veclen) {
512 E_ERROR("Size of file %d not a multiple of veclen %d\n",
513 n, wtf->in_veclen);
514 return -1;
515 }
516 /* Byteswap stuff here if necessary. */
517 if (wtf->byteswap) {
518 for (i = 0; i < n; ++i)
519 SWAP_FLOAT32(wtf->feat[0] + i);
520 }
521 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
522 for (i = 0; i < nfr; ++i) {
523 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
524 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
525 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
526 else
527 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
528 }
529 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
530 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
531 }
532 }
533 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
534 return -1;
535 nfloat += n;
536 }
537
538 if (fclose(wtf->infh) == EOF)
539 E_ERROR_SYSTEM("Failed to close input file");
540 wtf->infh = NULL;
541 return nfloat;
542}
543
544static const audio_type_t types[] = {
545#ifdef HAVE_SNDFILE_H
546 { "-sndfile", &detect_sndfile, &decode_sndfile },
547#endif
548 { "-mswav", &detect_riff, &decode_pcm },
549 { "-nist", &detect_nist, &decode_pcm },
550 { "-raw", &detect_raw, &decode_pcm },
551 { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
552};
553static const int ntypes = sizeof(types)/sizeof(types[0]);
554static const audio_type_t mfcc_type = {
555 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
556};
557
563static int
564output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
565{
566 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
567 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
568 return -1;
569 }
570 return 0;
571}
572
578static int
579output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
580{
581 int i, nfloat = 0;
582
583 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
584 for (i = 0; i < nfr; ++i) {
585 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
586 E_ERROR_SYSTEM("Writing %d values to %s failed",
587 wtf->veclen, wtf->outfile);
588 return -1;
589 }
590 nfloat += wtf->veclen;
591 }
592 return nfloat;
593}
594
/** Base parameter kinds stored in an HTK file header. */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,   /* PCM audio (rarely used) */
    LPC       = 1,   /* LPC filter coefficients */
    LPCREFC   = 2,   /* LPC reflection coefficients */
    LPCEPSTRA = 3,   /* LPC-based cepstral coefficients */
    LPCDELCEP = 4,   /* LPCC plus deltas */
    IREFC     = 5,   /* 16-bit integer LPC reflection coefficients */
    MFCC      = 6,   /* MFCCs */
    FBANK     = 7,   /* log mel spectrum */
    MELSPEC   = 8,   /* linear mel spectrum */
    USER      = 9,   /* user defined */
    DISCRETE  = 10,  /* vector quantized data */
    PLP       = 11   /* PLP coefficients */
} htk_feature_kind_t;
609
/** Qualifier bits (octal) OR-ed onto the base kind in an HTK header. */
typedef enum htk_feature_flag_e {
    _E = 0000100,  /* has energy */
    _N = 0000200,  /* absolute energy suppressed */
    _D = 0000400,  /* has delta coefficients */
    _A = 0001000,  /* has acceleration (delta-delta) coefficients */
    _C = 0002000,  /* is compressed */
    _Z = 0004000,  /* has zero-mean static coefficients (i.e. CMN) */
    _K = 0010000,  /* has CRC checksum */
    _O = 0020000,  /* has 0th cepstral coefficient */
    _V = 0040000,  /* has VQ data */
    _T = 0100000   /* has third differential coefficients */
} htk_feature_flag_t;
622
626static int
627output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
628{
629 int32 samp_period;
630 int16 samp_size;
631 int16 param_kind;
632 int swap = FALSE;
633
634 /* HTK files are big-endian. */
635 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
636 swap = TRUE;
637 /* Same file size thing as in Sphinx files (I think) */
638 if (swap) SWAP_INT32(&nfloat);
639 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
640 return -1;
641 /* Sample period in 100ns units. */
642 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
643 if (swap) SWAP_INT32(&samp_period);
644 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
645 return -1;
646 /* Sample size - veclen * sizeof each sample. */
647 samp_size = wtf->veclen * 4;
648 if (swap) SWAP_INT16(&samp_size);
649 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
650 return -1;
651 /* Format and flags. */
652 if (cmd_ln_boolean_r(wtf->config, "-logspec")
653 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
654 param_kind = FBANK; /* log mel-filter bank outputs */
655 else
656 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
657 if (swap) SWAP_INT16(&param_kind);
658 if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
659 return -1;
660
661 return 0;
662}
663
667static int
668output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
669{
670 int i, j, swap, htk_reorder, nfloat = 0;
671
672 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
673 /* This is possibly inefficient, but probably not a big deal. */
674 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
675 htk_reorder = (0 == strcmp("htk", wtf->ot->name)
676 && !(cmd_ln_boolean_r(wtf->config, "-logspec")
677 || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
678 for (i = 0; i < nfr; ++i) {
679 if (htk_reorder) {
680 mfcc_t c0 = frames[i][0];
681 memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
682 frames[i][wtf->veclen - 1] = c0;
683 }
684 if (swap)
685 for (j = 0; j < wtf->veclen; ++j)
686 SWAP_FLOAT32(frames[i] + j);
687 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
688 E_ERROR_SYSTEM("Writing %d values to %s failed",
689 wtf->veclen, wtf->outfile);
690 return -1;
691 }
692 nfloat += wtf->veclen;
693 }
694 return nfloat;
695}
696
700static int
701output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
702{
703 int i, j, nfloat = 0;
704
705 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
706 for (i = 0; i < nfr; ++i) {
707 for (j = 0; j < wtf->veclen; ++j) {
708 fprintf(wtf->outfh, "%.5g", frames[i][j]);
709 if (j == wtf->veclen - 1)
710 fprintf(wtf->outfh, "\n");
711 else
712 fprintf(wtf->outfh, " ");
713 }
714 nfloat += wtf->veclen;
715 }
716 return nfloat;
717}
718
719static const output_type_t outtypes[] = {
720 { "sphinx", &output_header_sphinx, &output_frames_sphinx },
721 { "htk", &output_header_htk, &output_frames_htk },
722 { "text", NULL, &output_frames_text }
723};
724static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
725
727sphinx_wave2feat_init(cmd_ln_t *config)
728{
730 int i;
731
732 wtf = ckd_calloc(1, sizeof(*wtf));
733 wtf->refcount = 1;
734 wtf->config = cmd_ln_retain(config);
735 wtf->fe = fe_init_auto_r(wtf->config);
736 wtf->ot = outtypes; /* Default (sphinx) type. */
737 for (i = 0; i < nouttypes; ++i) {
738 output_type_t const *otype = &outtypes[i];
739 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
740 wtf->ot = otype;
741 break;
742 }
743 }
744 if (i == nouttypes) {
745 E_ERROR("Unknown output type: '%s'\n",
746 cmd_ln_str_r(config, "-ofmt"));
747 sphinx_wave2feat_free(wtf);
748 return NULL;
749 }
750
751 return wtf;
752}
753
754int
755sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
756{
757 if (wtf == NULL)
758 return 0;
759 if (--wtf->refcount > 0)
760 return wtf->refcount;
761
762 if (wtf->audio)
763 ckd_free(wtf->audio);
764 if (wtf->feat)
765 ckd_free_2d(wtf->feat);
766 if (wtf->infile)
767 ckd_free(wtf->infile);
768 if (wtf->outfile)
769 ckd_free(wtf->outfile);
770 if (wtf->infh) {
771 if (fclose(wtf->infh) == EOF)
772 E_ERROR_SYSTEM("Failed to close input file");
773 }
774 if (wtf->outfh) {
775 if (fclose(wtf->outfh) == EOF)
776 E_ERROR_SYSTEM("Failed to close output file");
777 }
778 cmd_ln_free_r(wtf->config);
779 fe_free(wtf->fe);
780 ckd_free(wtf);
781
782 return 0;
783}
784
786sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
787{
788 ++wtf->refcount;
789 return wtf;
790}
791
792static audio_type_t const *
793detect_audio_type(sphinx_wave2feat_t *wtf)
794{
795 audio_type_t const *atype;
796 int i;
797
798 /* Special case audio type for Sphinx MFCC inputs. */
799 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
800 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
801 int rv = mfcc_type.detect(wtf);
802 if (rv == -1)
803 goto error_out;
804 return &mfcc_type;
805 }
806
807 /* Try to use the type of infile given on the command line. */
808 for (i = 0; i < ntypes; ++i) {
809 int rv;
810 atype = &types[i];
811 if (cmd_ln_boolean_r(wtf->config, atype->name)) {
812 rv = (*atype->detect)(wtf);
813 if (rv == -1)
814 goto error_out;
815 else if (rv == TRUE)
816 break;
817 }
818 }
819 if (i == ntypes) {
820 /* Detect file type of infile and get parameters. */
821 for (i = 0; i < ntypes; ++i) {
822 int rv;
823 atype = &types[i];
824 rv = (*atype->detect)(wtf);
825 if (rv == -1)
826 goto error_out;
827 else if (rv == TRUE)
828 break;
829 }
830 if (i == ntypes)
831 goto error_out;
832 }
833 return atype;
834 error_out:
835 if (wtf->infh)
836 fclose(wtf->infh);
837 wtf->infh = NULL;
838 return NULL;
839}
840
841int
842sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
843 char const *infile, char const *outfile)
844{
845 int nchans, minfft, nfft, nfloat, veclen;
846 audio_type_t const *atype;
847 int fshift, fsize;
848
849 if (cmd_ln_boolean_r(wtf->config, "-verbose"))
850 E_INFO("Converting %s to %s\n", infile, outfile);
851
852 wtf->infile = ckd_salloc(infile);
853
854 /* Detect input file type. */
855 if ((atype = detect_audio_type(wtf)) == NULL)
856 return -1;
857
858 /* Determine whether to byteswap input. */
859 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
860 cmd_ln_str_r(wtf->config, "-input_endian"));
861
862 /* Make sure the FFT size is sufficiently large. */
863 minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
864 * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
865 for (nfft = 1; nfft < minfft; nfft <<= 1)
866 ;
867 if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
868 E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
869 cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
870 cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
871 fe_free(wtf->fe);
872 wtf->fe = fe_init_auto_r(wtf->config);
873 }
874
875 /* Get the output frame size (if not already set). */
876 if (wtf->veclen == 0)
877 wtf->veclen = fe_get_output_size(wtf->fe);
878
879 /* Set up the input and output buffers. */
880 fe_get_input_size(wtf->fe, &fshift, &fsize);
881 /* Want to get at least a whole frame plus shift in here. Also we
882 will either pick or mix multiple channels so we need to read
883 them all at once. */
884 nchans = cmd_ln_int32_r(wtf->config, "-nchans");
885 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
886 if (wtf->blocksize < (fsize + fshift) * nchans) {
887 E_INFO("Block size of %d too small, increasing to %d\n",
888 wtf->blocksize,
889 (fsize + fshift) * nchans);
890 wtf->blocksize = (fsize + fshift) * nchans;
891 }
892 wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
893 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
894
895 /* Use the maximum of the input and output frame sizes to allocate this. */
896 veclen = wtf->veclen;
897 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
898
899 wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
900
901 /* Let's go! */
902 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
903 E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
904 return -1;
905 }
906 /* Write an empty header, which we'll fill in later. */
907 if (wtf->ot->output_header &&
908 (*wtf->ot->output_header)(wtf, 0) < 0) {
909 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
910 goto error_out;
911 }
912 wtf->outfile = ckd_salloc(outfile);
913
914 if ((nfloat = (*atype->decode)(wtf)) < 0) {
915 E_ERROR("Failed to convert");
916 goto error_out;
917 }
918
919 if (wtf->ot->output_header) {
920 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
921 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
922 goto error_out;
923 }
924 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
925 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
926 goto error_out;
927 }
928 }
929
930
931 if (wtf->audio)
932 ckd_free(wtf->audio);
933 if (wtf->feat)
934 ckd_free_2d(wtf->feat);
935 if (wtf->infile)
936 ckd_free(wtf->infile);
937 if (wtf->outfile)
938 ckd_free(wtf->outfile);
939
940 wtf->audio = NULL;
941 wtf->infile = NULL;
942 wtf->feat = NULL;
943 wtf->outfile = NULL;
944
945 if (wtf->outfh)
946 if (fclose(wtf->outfh) == EOF)
947 E_ERROR_SYSTEM("Failed to close output file");
948 wtf->outfh = NULL;
949
950 return 0;
951
952error_out:
953
954 if (wtf->audio)
955 ckd_free(wtf->audio);
956 if (wtf->feat)
957 ckd_free_2d(wtf->feat);
958 if (wtf->infile)
959 ckd_free(wtf->infile);
960 if (wtf->outfile)
961 ckd_free(wtf->outfile);
962
963 wtf->audio = NULL;
964 wtf->infile = NULL;
965 wtf->feat = NULL;
966 wtf->outfile = NULL;
967
968 if (wtf->outfh)
969 if (fclose(wtf->outfh) == EOF)
970 E_ERROR_SYSTEM("Failed to close output file");
971 wtf->outfh = NULL;
972
973 return -1;
974}
975
976void
977build_filenames(cmd_ln_t *config, char const *basename,
978 char **out_infile, char **out_outfile)
979{
980 char const *di, *do_, *ei, *eo;
981
982 di = cmd_ln_str_r(config, "-di");
983 do_ = cmd_ln_str_r(config, "-do");
984 ei = cmd_ln_str_r(config, "-ei");
985 eo = cmd_ln_str_r(config, "-eo");
986
987 *out_infile = string_join(di ? di : "",
988 di ? "/" : "",
989 basename,
990 ei ? "." : "",
991 ei ? ei : "",
992 NULL);
993 *out_outfile = string_join(do_ ? do_ : "",
994 do_ ? "/" : "",
995 basename,
996 eo ? "." : "",
997 eo ? eo : "",
998 NULL);
999 /* Build output directory structure if possible/requested (it is
1000 * by default). */
1001 if (cmd_ln_boolean_r(config, "-build_outdirs")) {
1002 char *dirname = ckd_salloc(*out_outfile);
1003 path2dirname(*out_outfile, dirname);
1004 build_directory(dirname);
1005 ckd_free(dirname);
1006 }
1007}
1008
1009static int
1010run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
1011{
1012 hash_table_t *files;
1013 hash_iter_t *itor;
1014 lineiter_t *li;
1015 FILE *ctlfh;
1016 int nskip, runlen, npart, rv = 0;
1017
1018 if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
1019 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
1020 return -1;
1021 }
1022 nskip = cmd_ln_int32_r(wtf->config, "-nskip");
1023 runlen = cmd_ln_int32_r(wtf->config, "-runlen");
1024 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
1025 /* Count lines in the file. */
1026 int partlen, part, nlines = 0;
1027 part = cmd_ln_int32_r(wtf->config, "-part");
1028 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
1029 ++nlines;
1030 fseek(ctlfh, 0, SEEK_SET);
1031 partlen = nlines / npart;
1032 nskip = partlen * (part - 1);
1033 if (part == npart)
1034 runlen = -1;
1035 else
1036 runlen = partlen;
1037 }
1038 if (runlen != -1){
1039 E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
1040 files = hash_table_new(runlen, HASH_CASE_YES);
1041 }
1042 else {
1043 E_INFO("Processing all remaining utterances at position %d\n", nskip);
1044 files = hash_table_new(1000, HASH_CASE_YES);
1045 }
1046 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
1047 char *c, *infile, *outfile;
1048
1049 if (nskip-- > 0)
1050 continue;
1051 if (runlen == 0) {
1052 lineiter_free(li);
1053 break;
1054 }
1055 --runlen;
1056
1057 string_trim(li->buf, STRING_BOTH);
1058 /* Extract the file ID from the control line. */
1059 if ((c = strchr(li->buf, ' ')) != NULL)
1060 *c = '\0';
1061 if (strlen(li->buf) == 0) {
1062 E_WARN("Empty line %d in control file, skipping\n", li->lineno);
1063 continue;
1064 }
1065 build_filenames(wtf->config, li->buf, &infile, &outfile);
1066 if (hash_table_lookup(files, infile, NULL) == 0)
1067 continue;
1068 rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
1069 hash_table_enter(files, infile, outfile);
1070 if (rv != 0) {
1071 lineiter_free(li);
1072 break;
1073 }
1074 }
1075 for (itor = hash_table_iter(files); itor;
1076 itor = hash_table_iter_next(itor)) {
1077 ckd_free((void *)hash_entry_key(itor->ent));
1078 ckd_free(hash_entry_val(itor->ent));
1079 }
1080 hash_table_free(files);
1081
1082 if (fclose(ctlfh) == EOF)
1083 E_ERROR_SYSTEM("Failed to close control file");
1084 return rv;
1085}
1086
1087int
1088main(int argc, char *argv[])
1089{
1090 sphinx_wave2feat_t *wtf;
1091 cmd_ln_t *config;
1092 int rv;
1093
1094 /* Initialize config. */
1095 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
1096 return 2;
1097
1098 /* Parse an argument file if there's one in there. */
1099 if (cmd_ln_str_r(config, "-argfile"))
1100 config = cmd_ln_parse_file_r(config, defn,
1101 cmd_ln_str_r(config, "-argfile"), FALSE);
1102 if (config == NULL) {
1103 E_ERROR("Command line parsing failed\n");
1104 return 1;
1105 }
1106 if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1107 E_ERROR("Failed to initialize wave2feat object\n");
1108 return 1;
1109 }
1110
1111 /* If there's a control file run through it, otherwise we will do
1112 * a single file (which is what run_control_file will do
1113 * internally too) */
1114 if (cmd_ln_str_r(config, "-c"))
1115 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1116 else
1117 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1118 cmd_ln_str_r(config, "-o"));
1119
1120 sphinx_wave2feat_free(wtf);
1121 cmd_ln_free_r(config);
1122 return rv;
1123}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition ckd_alloc.c:252
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition cmd_ln.h:334
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition cmd_ln.c:1036
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition cmd_ln.c:989
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each token into an argv[] for cmd_l...
Definition cmd_ln.c:773
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition cmd_ln.c:949
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition cmd_ln.c:1029
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition cmd_ln.c:551
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text; call perror("").
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_WARN
Print warning information to standard error stream.
Definition err.h:164
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
File name related operations.
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir. Caller must have allocate...
Definition filename.c:90
Hash table implementation.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition hash_table.c:695
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition hash_table.c:663
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition hash_table.c:309
#define hash_entry_val(e)
Access macros.
Definition hash_table.h:175
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition hash_table.c:508
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition hash_table.c:653
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition hash_table.c:158
File I/O related operations.
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition pio.c:358
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition pio.c:653
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition pio.c:255
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition pio.c:338
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition strfuncs.c:89
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition strfuncs.c:62
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition strfuncs.c:115
@ STRING_BOTH
Both ends of string.
Definition strfuncs.h:73
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition strfuncs.c:56
RIFF 44-byte header structure for MS wav files.
Definition sphinx_fe.c:98
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
hash_entry_t * ent
Current entry in that table.
Definition hash_table.h:170
Line iterator for files.
Definition pio.h:177
int byteswap
Whether byteswapping is necessary.
Definition sphinx_fe.c:90
int in_veclen
Length of each input vector (for cep<->spec).
Definition sphinx_fe.c:89
cmd_ln_t * config
Configuration parameters.
Definition sphinx_fe.c:78
fe_t * fe
Front end object.
Definition sphinx_fe.c:79
char * infile
Path to input file.
Definition sphinx_fe.c:80
short * audio
Audio buffer.
Definition sphinx_fe.c:84
output_type_t const * ot
Output type object.
Definition sphinx_fe.c:94
char * outfile
Path to output file.
Definition sphinx_fe.c:81
mfcc_t ** feat
Feature buffer.
Definition sphinx_fe.c:85
int featsize
Size of feature buffer.
Definition sphinx_fe.c:87
int veclen
Length of each output vector.
Definition sphinx_fe.c:88
FILE * outfh
Output file handle.
Definition sphinx_fe.c:83
FILE * infh
Input file handle.
Definition sphinx_fe.c:82
int refcount
Reference count.
Definition sphinx_fe.c:77
int blocksize
Size of audio buffer.
Definition sphinx_fe.c:86