SphinxBase 0.6
sphinx_pitch.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/yin.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/byteorder.h>
#include <sphinxbase/strfuncs.h>
#include <sphinxbase/err.h>
#include <sphinxbase/pio.h>
48
/* Fall back to 0 when the build configuration does not define
 * WORDS_BIGENDIAN; the code below tests it as a plain boolean. */
#if !defined(WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif
52
53static arg_t defn[] = {
54 { "-i",
56 NULL,
57 "Single audio input file" },
58
59 { "-o",
61 NULL,
62 "Single text output file (standard output will be used if not given)" },
63
64 { "-c",
66 NULL,
67 "Control file for batch processing" },
68
69 { "-nskip",
71 "0",
72 "If a control file was specified, the number of utterances to skip at the head of the file" },
73
74 { "-runlen",
76 "-1",
77 "If a control file was specified, the number of utterances to process (see -nskip too)" },
78
79 { "-di",
81 NULL,
82 "Input directory, input file names are relative to this, if defined" },
83
84 { "-ei",
86 NULL,
87 "Input extension to be applied to all input files" },
88
89 { "-do",
91 NULL,
92 "Output directory, output files are relative to this" },
93
94 { "-eo",
96 NULL,
97 "Output extension to be applied to all output files" },
98
99 { "-nist",
101 "no",
102 "Defines input format as NIST sphere" },
103
104 { "-raw",
106 "no",
107 "Defines input format as raw binary data" },
108
109 { "-mswav",
111 "no",
112 "Defines input format as Microsoft Wav (RIFF)" },
113
114 { "-samprate",
115 ARG_INT32,
116 "0",
117 "Sampling rate of audio data (will be determined automatically if 0)" },
118
119 { "-input_endian",
121 NULL,
122 "Endianness of audio data (will be determined automatically if not given)" },
123
124 { "-fshift",
126 "0.01",
127 "Frame shift: number of seconds between each analysis frame." },
128
129 { "-flen",
131 "0.025",
132 "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." },
133
134 { "-smooth_window",
135 ARG_INT32,
136 "2",
137 "Number of frames on either side of the current frame to use for smoothing." },
138
139 { "-voice_thresh",
141 "0.1",
142 "Threshold of normalized difference under which to search for the fundamental period." },
143
144 { "-search_range",
146 "0.2",
147 "Fraction of the best local estimate to use as a search range for smoothing." },
148
149 { NULL, 0, NULL, NULL }
150};
151
/* Estimate pitch for the audio file `in` and write one text line per frame
 * to `out` (standard output when `out` is NULL).  Returns 0 on success and
 * -1 on failure. */
static int extract_pitch(const char *in, const char *out);

/* Run extract_pitch() for each utterance named in the control file `ctl`.
 * Returns 0 on success and a negative value on failure. */
static int run_control_file(const char *ctl);
154
155int
156main(int argc, char *argv[])
157{
158 cmd_ln_parse(defn, argc, argv, TRUE);
159
160 /* Run a control file if requested. */
161 if (cmd_ln_str("-c")) {
162 if (run_control_file(cmd_ln_str("-c")) < 0)
163 return 1;
164 }
165 else {
166 if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0)
167 return 1;
168 }
169
170 cmd_ln_free();
171 return 0;
172}
173
174static int
175guess_file_type(char const *file, FILE *infh)
176{
177 char header[4];
178
179 fseek(infh, 0, SEEK_SET);
180 if (fread(header, 1, 4, infh) != 4) {
181 E_ERROR_SYSTEM("Failed to read 4 byte header");
182 return -1;
183 }
184 if (0 == memcmp(header, "RIFF", 4)) {
185 E_INFO("%s appears to be a WAV file\n", file);
186 cmd_ln_set_boolean("-mswav", TRUE);
187 cmd_ln_set_boolean("-nist", FALSE);
188 cmd_ln_set_boolean("-raw", FALSE);
189 }
190 else if (0 == memcmp(header, "NIST", 4)) {
191 E_INFO("%s appears to be a NIST SPHERE file\n", file);
192 cmd_ln_set_boolean("-mswav", FALSE);
193 cmd_ln_set_boolean("-nist", TRUE);
194 cmd_ln_set_boolean("-raw", FALSE);
195 }
196 else {
197 E_INFO("%s appears to be raw data\n", file);
198 cmd_ln_set_boolean("-mswav", FALSE);
199 cmd_ln_set_boolean("-nist", FALSE);
200 cmd_ln_set_boolean("-raw", TRUE);
201 }
202 fseek(infh, 0, SEEK_SET);
203 return 0;
204}
205
/*
 * Read `nmemb` items of `size` bytes each from `stream` into `ptr`.  On a
 * short read, log the failure and jump to the `error_out` label of the
 * enclosing function (which must therefore define one).
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * e.g. inside an unbraced if/else.  Note that `nmemb` is evaluated more
 * than once, so pass an expression without side effects.
 */
#define TRY_FREAD(ptr, size, nmemb, stream)                                 \
    do {                                                                    \
        if (fread((ptr), (size), (nmemb), (stream)) != (size_t)(nmemb)) {   \
            E_ERROR_SYSTEM("Failed to read %d bytes",                       \
                           (int)((size) * (nmemb)));                        \
            goto error_out;                                                 \
        }                                                                   \
    } while (0)
211
212static int
213read_riff_header(FILE *infh)
214{
215 char id[4];
216 int32 intval, header_len;
217 int16 shortval;
218
219 /* RIFF files are little-endian by definition. */
220 cmd_ln_set_str("-input_endian", "little");
221
222 /* Read in all the header chunks and etcetera. */
223 TRY_FREAD(id, 1, 4, infh);
224 /* Total file length (we don't care) */
225 TRY_FREAD(&intval, 4, 1, infh);
226 /* 'WAVE' */
227 TRY_FREAD(id, 1, 4, infh);
228 if (0 != memcmp(id, "WAVE", 4)) {
229 E_ERROR("This is not a WAVE file\n");
230 goto error_out;
231 }
232 /* 'fmt ' */
233 TRY_FREAD(id, 1, 4, infh);
234 if (0 != memcmp(id, "fmt ", 4)) {
235 E_ERROR("Format chunk missing\n");
236 goto error_out;
237 }
238 /* Length of 'fmt ' chunk */
239 TRY_FREAD(&intval, 4, 1, infh);
240 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
241 header_len = intval;
242
243 /* Data format. */
244 TRY_FREAD(&shortval, 2, 1, infh);
245 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
246 if (shortval != 1) { /* PCM */
247 E_ERROR("WAVE file is not in PCM format\n");
248 goto error_out;
249 }
250
251 /* Number of channels. */
252 TRY_FREAD(&shortval, 2, 1, infh);
253 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
254 if (shortval != 1) { /* PCM */
255 E_ERROR("WAVE file is not single channel\n");
256 goto error_out;
257 }
258
259 /* Sampling rate (finally!) */
260 TRY_FREAD(&intval, 4, 1, infh);
261 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
262 if (cmd_ln_int32("-samprate") == 0)
263 cmd_ln_set_int32("-samprate", intval);
264 else if (cmd_ln_int32("-samprate") != intval) {
265 E_WARN("WAVE file sampling rate %d != -samprate %d\n",
266 intval, cmd_ln_int32("-samprate"));
267 }
268
269 /* Average bytes per second (we don't care) */
270 TRY_FREAD(&intval, 4, 1, infh);
271
272 /* Block alignment (we don't care) */
273 TRY_FREAD(&shortval, 2, 1, infh);
274
275 /* Bits per sample (must be 16) */
276 TRY_FREAD(&shortval, 2, 1, infh);
277 if (WORDS_BIGENDIAN) SWAP_INT16(&shortval);
278 if (shortval != 16) {
279 E_ERROR("WAVE file is not 16-bit\n");
280 goto error_out;
281 }
282
283 /* Any extra parameters. */
284 if (header_len > 16)
285 fseek(infh, header_len - 16, SEEK_CUR);
286
287 /* Now skip to the 'data' chunk. */
288 while (1) {
289 TRY_FREAD(id, 1, 4, infh);
290 if (0 == memcmp(id, "data", 4)) {
291 /* Total number of bytes of data (we don't care). */
292 TRY_FREAD(&intval, 4, 1, infh);
293 break;
294 }
295 else {
296 /* Some other stuff... */
297 /* Number of bytes of ... whatever */
298 TRY_FREAD(&intval, 4, 1, infh);
299 if (WORDS_BIGENDIAN) SWAP_INT32(&intval);
300 fseek(infh, intval, SEEK_CUR);
301 }
302 }
303
304 /* We are ready to rumble. */
305 return 0;
306error_out:
307 return -1;
308}
309
310static int
311read_nist_header(FILE *infh)
312{
313 char hdr[1024];
314 char *line, *c;
315
316 TRY_FREAD(hdr, 1, 1024, infh);
317 hdr[1023] = '\0';
318
319 /* Roughly parse it to find the sampling rate and byte order
320 * (don't bother with other stuff) */
321 if ((line = strstr(hdr, "sample_rate")) == NULL) {
322 E_ERROR("No sampling rate in NIST header!\n");
323 goto error_out;
324 }
325 c = strchr(line, '\n');
326 if (c) *c = '\0';
327 c = strrchr(line, ' ');
328 if (c == NULL) {
329 E_ERROR("Could not find sampling rate!\n");
330 goto error_out;
331 }
332 ++c;
333 if (cmd_ln_int32("-samprate") == 0)
334 cmd_ln_set_int32("-samprate", atoi(c));
335 else if (cmd_ln_int32("-samprate") != atoi(c)) {
336 E_WARN("NIST file sampling rate %d != -samprate %d\n",
337 atoi(c), cmd_ln_int32("-samprate"));
338 }
339
340 if (line + strlen(line) < hdr + 1023)
341 line[strlen(line)] = ' ';
342 if ((line = strstr(hdr, "sample_byte_format")) == NULL) {
343 E_ERROR("No sample byte format in NIST header!\n");
344 goto error_out;
345 }
346 c = strchr(line, '\n');
347 if (c) *c = '\0';
348 c = strrchr(line, ' ');
349 if (c == NULL) {
350 E_ERROR("Could not find sample byte order!\n");
351 goto error_out;
352 }
353 ++c;
354 if (0 == memcmp(c, "01", 2)) {
355 cmd_ln_set_str("-input_endian", "little");
356 }
357 else if (0 == memcmp(c, "10", 2)) {
358 cmd_ln_set_str("-input_endian", "big");
359 }
360 else {
361 E_ERROR("Unknown byte order %s\n", c);
362 goto error_out;
363 }
364
365 /* We are ready to rumble. */
366 return 0;
367error_out:
368 return -1;
369}
370
371static int
372extract_pitch(const char *in, const char *out)
373{
374 FILE *infh = NULL, *outfh = NULL;
375 size_t flen, fshift, nsamps;
376 int16 *buf = NULL;
377 yin_t *yin = NULL;
378 uint16 period, bestdiff;
379 int32 sps;
380
381 if (out) {
382 if ((outfh = fopen(out, "w")) == NULL) {
383 E_ERROR_SYSTEM("Failed to open %s for writing", out);
384 goto error_out;
385 }
386 }
387 else {
388 outfh = stdout;
389 }
390 if ((infh = fopen(in, "rb")) == NULL) {
391 E_ERROR_SYSTEM("Failed to open %s for reading", in);
392 goto error_out;
393 }
394
395 /* If we weren't told what the file type is, weakly try to
396 * determine it (actually it's pretty obvious) */
397 if (!(cmd_ln_boolean("-raw")
398 || cmd_ln_boolean("-mswav")
399 || cmd_ln_boolean("-nist"))) {
400 if (guess_file_type(in, infh) < 0)
401 goto error_out;
402 }
403
404 /* Grab the sampling rate and byte order from the header and also
405 * make sure this is 16-bit linear PCM. */
406 if (cmd_ln_boolean("-mswav")) {
407 if (read_riff_header(infh) < 0)
408 goto error_out;
409 }
410 else if (cmd_ln_boolean("-nist")) {
411 if (read_nist_header(infh) < 0)
412 goto error_out;
413 }
414 else if (cmd_ln_boolean("-raw")) {
415 /* Just use some defaults for sampling rate and endian. */
416 if (cmd_ln_str("-input_endian") == NULL) {
417 if (WORDS_BIGENDIAN)
418 cmd_ln_set_str("-input_endian", "big");
419 else
420 cmd_ln_set_str("-input_endian", "little");
421 }
422 if (cmd_ln_int32("-samprate") == 0)
423 cmd_ln_set_int32("-samprate", 16000);
424 }
425
426 /* Now read frames and write pitch estimates. */
427 sps = cmd_ln_int32("-samprate");
428 flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen"));
429 fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift"));
430 yin = yin_init(flen, cmd_ln_float32("-voice_thresh"),
431 cmd_ln_float32("-search_range"),
432 cmd_ln_int32("-smooth_window"));
433 if (yin == NULL) {
434 E_ERROR("Failed to initialize YIN\n");
435 goto error_out;
436 }
437 buf = ckd_calloc(flen, sizeof(*buf));
438 /* Read the first full frame of data. */
439 if (fread(buf, sizeof(*buf), flen, infh) != flen) {
440 /* Fail silently, which is probably okay. */
441 }
442 yin_start(yin);
443 nsamps = 0;
444 while (!feof(infh)) {
445 /* Process a frame of data. */
446 yin_write(yin, buf);
447 if (yin_read(yin, &period, &bestdiff)) {
448 fprintf(outfh, "%.3f %.2f %.2f\n",
449 /* Time point. */
450 (double)nsamps/sps,
451 /* "Probability" of voicing. */
452 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
453 /* Pitch (possibly bogus) */
454 period == 0 ? sps : (double)sps / period);
455 nsamps += fshift;
456 }
457 /* Shift it back and get the next frame's overlap. */
458 memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf));
459 if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) {
460 /* Fail silently (FIXME: really?) */
461 }
462 }
463 yin_end(yin);
464 /* Process trailing frames of data. */
465 while (yin_read(yin, &period, &bestdiff)) {
466 fprintf(outfh, "%.3f %.2f %.2f\n",
467 /* Time point. */
468 (double)nsamps/sps,
469 /* "Probability" of voicing. */
470 bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
471 /* Pitch (possibly bogus) */
472 period == 0 ? sps : (double)sps / period);
473 }
474
475 if (yin)
476 yin_free(yin);
477 ckd_free(buf);
478 fclose(infh);
479 if (outfh != stdout)
480 fclose(outfh);
481 return 0;
482
483error_out:
484 yin_free(yin);
485 ckd_free(buf);
486 if (infh) fclose(infh);
487 if (outfh && outfh != stdout) fclose(outfh);
488 return -1;
489}
490
491static int
492run_control_file(const char *ctl)
493{
494 FILE *ctlfh;
495 char *line;
496 char *di, *dout, *ei, *eio;
497 size_t len;
498 int rv, guess_type, guess_sps, guess_endian;
499 int32 skip, runlen;
500
501 skip = cmd_ln_int32("-nskip");
502 runlen = cmd_ln_int32("-runlen");
503
504 /* Whether to guess file types */
505 guess_type = !(cmd_ln_boolean("-raw")
506 || cmd_ln_boolean("-mswav")
507 || cmd_ln_boolean("-nist"));
508 /* Whether to guess sampling rate */
509 guess_sps = (cmd_ln_int32("-samprate") == 0);
510 /* Whether to guess endian */
511 guess_endian = (cmd_ln_str("-input_endian") == NULL);
512
513 if ((ctlfh = fopen(ctl, "r")) == NULL) {
514 E_ERROR_SYSTEM("Failed to open control file %s", ctl);
515 return -1;
516 }
517 if (cmd_ln_str("-di"))
518 di = string_join(cmd_ln_str("-di"), "/", NULL);
519 else
520 di = ckd_salloc("");
521 if (cmd_ln_str("-do"))
522 dout = string_join(cmd_ln_str("-do"), "/", NULL);
523 else
524 dout = ckd_salloc("");
525 if (cmd_ln_str("-ei"))
526 ei = string_join(".", cmd_ln_str("-ei"), NULL);
527 else
528 ei = ckd_salloc("");
529 if (cmd_ln_str("-eo"))
530 eio = string_join(".", cmd_ln_str("-eo"), NULL);
531 else
532 eio = ckd_salloc("");
533 rv = 0;
534 while ((line = fread_line(ctlfh, &len)) != NULL) {
535 char *infile, *outfile;
536
537 if (skip-- > 0) {
538 ckd_free(line);
539 continue;
540 }
541 if (runlen == 0) {
542 ckd_free(line);
543 break;
544 }
545 --runlen;
546
547 if (line[len-1] == '\n')
548 line[len-1] = '\0';
549
550 infile = string_join(di, line, ei, NULL);
551 outfile = string_join(dout, line, eio, NULL);
552
553 /* Reset various guessed information */
554 if (guess_type) {
555 cmd_ln_set_boolean("-nist", FALSE);
556 cmd_ln_set_boolean("-mswav", FALSE);
557 cmd_ln_set_boolean("-raw", FALSE);
558 }
559 if (guess_sps)
560 cmd_ln_set_int32("-samprate", 0);
561 if (guess_endian)
562 cmd_ln_set_str("-input_endian", NULL);
563
564 rv = extract_pitch(infile, outfile);
565
566 ckd_free(infile);
567 ckd_free(outfile);
568 ckd_free(line);
569
570 if (rv != 0)
571 break;
572 }
573 ckd_free(di);
574 ckd_free(dout);
575 ckd_free(ei);
576 ckd_free(eio);
577 fclose(ctlfh);
578 return rv;
579}
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition ckd_alloc.c:241
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition ckd_alloc.h:248
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition ckd_alloc.h:264
Command-line and other configuration parsing and handling.
#define cmd_ln_boolean(name)
Retrieve a boolean from the global command line.
Definition cmd_ln.h:526
SPHINXBASE_EXPORT int32 cmd_ln_parse(const arg_t *defn, int32 argc, char *argv[], int32 strict)
Non-reentrant version of cmd_ln_parse().
Definition cmd_ln.c:755
SPHINXBASE_EXPORT void cmd_ln_free(void)
Free the global command line, if any exists.
Definition cmd_ln.c:1072
#define ARG_STRING
String argument (optional).
Definition cmd_ln.h:114
#define ARG_INT32
Definition cmd_ln.h:144
#define cmd_ln_float32(name)
Retrieve a 32-bit float from the global command line.
Definition cmd_ln.h:512
#define cmd_ln_set_boolean(n, b)
Set a boolean value in the global command line.
Definition cmd_ln.h:562
#define cmd_ln_str(name)
Retrieve a string from the global command line.
Definition cmd_ln.h:489
#define cmd_ln_set_int32(n, i)
Set a 32-bit integer value in the global command line.
Definition cmd_ln.h:541
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition cmd_ln.h:118
#define cmd_ln_int32(name)
Retrieve a 32-bit integer from the global command line.
Definition cmd_ln.h:505
#define ARG_FLOAT32
Definition cmd_ln.h:148
#define cmd_ln_set_str(n, s)
Set a string in the global command line.
Definition cmd_ln.h:534
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text, then call perror("").
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_WARN
Print warning information to standard error stream.
Definition err.h:164
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
file IO related operations.
SPHINXBASE_EXPORT char * fread_line(FILE *stream, size_t *out_len)
Read a line of arbitrary length from a file and return it as a newly allocated string.
Definition pio.c:367
Miscellaneous useful string functions.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string.
Definition strfuncs.c:62
Argument definition structure.
Definition yin.c:50
Implementation of pitch estimation.
SPHINXBASE_EXPORT int yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
Read a raw estimated pitch value from the pitch estimator.
Definition yin.c:221
SPHINXBASE_EXPORT void yin_end(yin_t *pe)
Mark the end of an utterance.
Definition yin.c:165
SPHINXBASE_EXPORT yin_t * yin_init(int frame_size, float search_threshold, float search_range, int smooth_window)
Initialize moving-window pitch estimation.
Definition yin.c:130
SPHINXBASE_EXPORT void yin_start(yin_t *pe)
Start processing an utterance.
Definition yin.c:157
SPHINXBASE_EXPORT void yin_write(yin_t *pe, int16 const *frame)
Feed a frame of data to the pitch estimator.
Definition yin.c:194
SPHINXBASE_EXPORT void yin_free(yin_t *pe)
Free a moving-window pitch estimator.
Definition yin.c:149