SphinxBase 0.6
cont_fileseg.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * cont_fileseg.c -- Read input file, filter silence regions, and segment into utterances.
39 *
40 * HISTORY
41 *
42 * $Log: cont_fileseg.c,v $
43 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
44 * re-importation
45 *
46 * Revision 1.13 2005/06/30 00:28:46 rkm
47 * Kept within-utterance silences in rawmode
48 *
49 *
50 * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
51 * Modified to use new state variables in cont_ad_t.
52 *
53 * Revision 1.12 2005/05/31 15:54:38 rkm
54 * *** empty log message ***
55 *
56 * Revision 1.11 2005/05/24 20:56:58 rkm
57 * Added min/max-noise parameters to cont_fileseg
58 *
59 * Revision 1.10 2005/05/13 23:28:43 egouvea
60 * Changed null device to system dependent one: NUL for windows, /dev/null for everything else
61 *
62 * $Log: cont_fileseg.c,v $
63 * Revision 1.1.1.1 2006/05/23 18:45:02 dhuggins
64 * re-importation
65 *
66 * Revision 1.13 2005/06/30 00:28:46 rkm
67 * Kept within-utterance silences in rawmode
68 *
69 * Revision 1.12 2005/05/31 15:54:38 rkm
70 * *** empty log message ***
71 *
72 * Revision 1.11 2005/05/24 20:56:58 rkm
73 * Added min/max-noise parameters to cont_fileseg
74 *
75 * Revision 1.9 2005/02/13 01:29:48 rkm
76 * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
77 *
78 * Revision 1.8 2005/02/01 22:21:13 rkm
79 * Added raw data logging, and raw data pass-through mode to cont_ad
80 *
81 * Revision 1.7 2004/07/16 00:57:11 egouvea
82 * Added Ravi's implementation of FSG support.
83 *
84 * Revision 1.3 2004/06/25 14:58:05 rkm
85 * *** empty log message ***
86 *
87 * Revision 1.2 2004/06/23 20:32:08 rkm
88 * Exposed several cont_ad config parameters
89 *
90 *
91 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
92 * Created.
93 */
94
95#include <stdio.h>
96#include <stdlib.h>
97#include <string.h>
98#include <assert.h>
99#include <math.h>
100
101#include <sphinxbase/prim_type.h>
102#include <sphinxbase/ad.h>
103#include <sphinxbase/cont_ad.h>
104#include <sphinxbase/err.h>
105
106static FILE *infp; /* File being segmented */
107static int32 swap;
108
109/* Max size read by file_ad_read function on each invocation, for debugging */
110static int32 max_ad_read_size;
111
112#if defined(WIN32) && !defined(GNUWINCE)
113#define NULL_DEVICE "NUL"
114#else
115#define NULL_DEVICE "/dev/null"
116#endif
117
118
119/*
120 * Need to provide cont_ad_init with a read function to read the input file.
121 * This is it. The ad_rec_t *r argument is ignored since there is no A/D
122 * device involved.
123 */
124static int32
125file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
126{
127 int32 i, k;
128
129 if (max > max_ad_read_size)
130 max = max_ad_read_size;
131
132 k = fread(buf, sizeof(int16), max, infp);
133 if (swap) {
134 for (i = 0; i < k; i++) {
135 buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
136 }
137 }
138
139 return ((k > 0) ? k : -1);
140}
141
142
/*
 * Print the command-line usage summary through the E_INFO/E_INFOCONT logging
 * macros and terminate the program.
 *
 * pgm -- program name printed at the start of the usage line.
 *
 * Does not return: always calls exit(0), including when it is invoked after
 * an argument error has been reported by the caller.
 */
static void
usagemsg(char *pgm)
{
    E_INFO("Usage: %s \\\n", pgm);
    E_INFOCONT("\t[-? | -h] \\\n");
    E_INFOCONT("\t[-d | -debug] \\\n");
    E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
    E_INFOCONT("\t[-b | -byteswap] \\\n");
    /* Without the macro call this line was a no-op expression statement. */
    E_INFOCONT
        ("\t[{-s | -silsep} <length-silence-separator(sec)> (0.5)] \\\n");
    E_INFOCONT("\t[-w | -writeseg] \\\n");
    E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
    E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
    E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
    E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
    E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
    E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
    E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
    E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
    E_INFOCONT("\t[-c <copy-input-file>] \\\n");
    E_INFOCONT("\t[-r | -rawmode] \\\n");
    E_INFOCONT("\t-i <input-file>\n");

    exit(0);
}
168
169/*
170 * Read specified input file, segment it into utterances wherever a silence segment of
171 * a given minimum duration is encountered. Filter out long silences.
172 * Utterances are written to files named 00000000.raw, 00000001.raw, 00000002.raw, etc.
173 */
174int
175main(int32 argc, char **argv)
176{
177 cont_ad_t *cont;
178 int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
179 int16 buf[4096];
180 char *infile, *copyfile, segfile[1024];
181 FILE *fp;
182 float endsil;
183 ad_rec_t ad;
184 int32 i, k;
185 int32 winsize, leader, trailer;
186 int32 orig_min_noise, orig_max_noise;
187 int32 orig_delta_sil, orig_delta_speech;
188 int32 orig_speech_onset, orig_sil_onset;
189 int32 min_noise, max_noise;
190 int32 delta_sil, delta_speech;
191 int32 sil_onset, speech_onset;
192 float32 orig_adapt_rate;
193 float32 adapt_rate;
194 int32 total_speech_samples;
195 float32 total_speech_sec;
196 FILE *rawfp;
197
198 /* Set argument defaults */
199 cont = NULL;
200 sps = 16000;
201 swap = 0;
202 endsil = 0.5;
203 writeseg = 0;
204 min_noise = max_noise = -1;
205 delta_sil = delta_speech = -1;
206 sil_onset = speech_onset = -1;
207 adapt_rate = -1.0;
208 max_ad_read_size = (int32) 0x7ffffff0;
209 debug = 0;
210 infile = NULL;
211 copyfile = NULL;
212 rawfp = NULL;
213 rawmode = 0;
214
215 /* Parse arguments */
216 for (i = 1; i < argc; i++) {
217 if ((strcmp(argv[i], "-help") == 0)
218 || (strcmp(argv[i], "-h") == 0)
219 || (strcmp(argv[i], "-?") == 0)) {
220 usagemsg(argv[0]);
221 }
222 else if ((strcmp(argv[i], "-debug") == 0)
223 || (strcmp(argv[i], "-d") == 0)) {
224 debug = 1;
225 }
226 else if (strcmp(argv[i], "-sps") == 0) {
227 i++;
228 if ((i == argc)
229 || (sscanf(argv[i], "%d", &sps) != 1)
230 || (sps <= 0)) {
231 E_ERROR("Invalid -sps argument\n");
232 usagemsg(argv[0]);
233 }
234 }
235 else if ((strcmp(argv[i], "-byteswap") == 0)
236 || (strcmp(argv[i], "-b") == 0)) {
237 swap = 1;
238 }
239 else if ((strcmp(argv[i], "-silsep") == 0)
240 || (strcmp(argv[i], "-s") == 0)) {
241 i++;
242 if ((i == argc)
243 || (sscanf(argv[i], "%f", &endsil) != 1)
244 || (endsil <= 0.0)) {
245 E_ERROR("Invalid -silsep argument\n");
246 usagemsg(argv[0]);
247 }
248 }
249 else if ((strcmp(argv[i], "-writeseg") == 0)
250 || (strcmp(argv[i], "-w") == 0)) {
251 writeseg = 1;
252 }
253 else if (strcmp(argv[i], "-min-noise") == 0) {
254 i++;
255 if ((i == argc) ||
256 (sscanf(argv[i], "%d", &min_noise) != 1) ||
257 (min_noise < 0)) {
258 E_ERROR("Invalid -min-noise argument\n");
259 usagemsg(argv[0]);
260 }
261 }
262 else if (strcmp(argv[i], "-max-noise") == 0) {
263 i++;
264 if ((i == argc) ||
265 (sscanf(argv[i], "%d", &max_noise) != 1) ||
266 (max_noise < 0)) {
267 E_ERROR("Invalid -max-noise argument\n");
268 usagemsg(argv[0]);
269 }
270 }
271 else if (strcmp(argv[i], "-delta-sil") == 0) {
272 i++;
273 if ((i == argc) ||
274 (sscanf(argv[i], "%d", &delta_sil) != 1) ||
275 (delta_sil < 0)) {
276 E_ERROR("Invalid -delta-sil argument\n");
277 usagemsg(argv[0]);
278 }
279 }
280 else if (strcmp(argv[i], "-delta-speech") == 0) {
281 i++;
282 if ((i == argc) ||
283 (sscanf(argv[i], "%d", &delta_speech) != 1) ||
284 (delta_speech < 0)) {
285 E_ERROR("Invalid -delta-speech argument\n");
286 usagemsg(argv[0]);
287 }
288 }
289 else if (strcmp(argv[i], "-sil-onset") == 0) {
290 i++;
291 if ((i == argc) ||
292 (sscanf(argv[i], "%d", &sil_onset) != 1) ||
293 (sil_onset < 1)) {
294 E_ERROR("Invalid -sil-onset argument\n");
295 usagemsg(argv[0]);
296 }
297 }
298 else if (strcmp(argv[i], "-speech-onset") == 0) {
299 i++;
300 if ((i == argc) ||
301 (sscanf(argv[i], "%d", &speech_onset) != 1) ||
302 (speech_onset < 1)) {
303 E_ERROR("Invalid -speech-onset argument\n");
304 usagemsg(argv[0]);
305 }
306 }
307 else if (strcmp(argv[i], "-adapt-rate") == 0) {
308 i++;
309 if ((i == argc) ||
310 (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
311 (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
312 E_ERROR("Invalid -adapt-rate argument\n");
313 usagemsg(argv[0]);
314 }
315 }
316 else if (strcmp(argv[i], "-max-adreadsize") == 0) {
317 i++;
318 if ((i == argc) ||
319 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
320 (max_ad_read_size < 1)) {
321 E_ERROR("Invalid -max-adreadsize argument\n");
322 usagemsg(argv[0]);
323 }
324 }
325 else if (strcmp(argv[i], "-c") == 0) {
326 i++;
327 if (i == argc) {
328 E_ERROR("Invalid -c argument\n");
329 usagemsg(argv[0]);
330 }
331 copyfile = argv[i];
332 }
333 else if ((strcmp(argv[i], "-rawmode") == 0)
334 || (strcmp(argv[i], "-r") == 0)) {
335 rawmode = 1;
336 }
337 else if (strcmp(argv[i], "-i") == 0) {
338 i++;
339 if (i == argc) {
340 E_ERROR("Invalid -i argument\n");
341 usagemsg(argv[0]);
342 }
343 infile = argv[i];
344 }
345 else {
346 usagemsg(argv[0]);
347 }
348 }
349
350 if (infile == NULL) {
351 E_ERROR("No input file specified\n");
352 usagemsg(argv[0]);
353 }
354
355 if ((infp = fopen(infile, "rb")) == NULL)
356 E_FATAL_SYSTEM("Failed to open '%s' for reading", infile);
357
358 /*
359 * Associate continuous listening module with opened input file and read function.
360 * No A/D device is involved, but need to fill in ad->sps.
361 * Calibrate input data using first few seconds of file, but then rewind it!!
362 */
363 ad.sps = sps;
364 ad.bps = sizeof(int16);
365 if (!rawmode)
366 cont = cont_ad_init(&ad, file_ad_read);
367 else
368 cont = cont_ad_init_rawmode(&ad, file_ad_read);
369
370 printf("Calibrating ...");
371 fflush(stdout);
372 if (cont_ad_calib(cont) < 0)
373 printf(" failed; file too short?\n");
374 else
375 printf(" done\n");
376 rewind(infp);
377
378 /* Convert desired min. inter-utterance silence duration to #samples */
379 siltime = (int32) (endsil * sps);
380
381 /* Enable writing raw input to output by the cont module if specified */
382 if (copyfile) {
383 if ((rawfp = fopen(copyfile, "wb")) == NULL)
384 E_ERROR_SYSTEM("Failed to open raw output file '%s' for writing");
385 else
386 cont_ad_set_rawfp(cont, rawfp);
387 }
388
390 &orig_delta_sil, &orig_delta_speech,
391 &orig_min_noise, &orig_max_noise,
392 &winsize,
393 &orig_speech_onset, &orig_sil_onset,
394 &leader, &trailer, &orig_adapt_rate);
395
396 E_INFO("Default parameters:\n");
397 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
398 orig_min_noise, orig_max_noise);
399 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
400 orig_delta_sil, orig_delta_speech);
401 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
402 orig_sil_onset, orig_speech_onset);
403 E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
404
405 if (min_noise < 0)
406 min_noise = orig_min_noise;
407 if (max_noise < 0)
408 max_noise = orig_max_noise;
409 if (delta_sil < 0)
410 delta_sil = orig_delta_sil;
411 if (delta_speech < 0)
412 delta_speech = orig_delta_speech;
413 if (sil_onset < 0)
414 sil_onset = orig_sil_onset;
415 if (speech_onset < 0)
416 speech_onset = orig_speech_onset;
417 if (adapt_rate < 0.0)
418 adapt_rate = orig_adapt_rate;
419
421 delta_sil, delta_speech,
422 min_noise, max_noise,
423 winsize,
424 speech_onset, sil_onset,
425 leader, trailer, adapt_rate);
426
427 E_INFO("Current parameters:\n");
428 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
429 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
430 delta_speech);
431 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
432 speech_onset);
433 E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
434
435 E_INFO("Sampling rate: %d", sps);
436 E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
437 E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
438
439 if (debug)
440 cont_ad_set_logfp(cont, stdout);
441
442 total_speech_samples = 0;
443 total_speech_sec = 0.0;
444
445 uttid = 0;
446 uttlen = 0;
447 starttime = 0;
448 fp = NULL;
449
450 /* Process data */
451 for (;;) {
452 /* Get audio data from continuous listening module */
453 k = cont_ad_read(cont, buf, 4096);
454
455 if (k < 0) { /* End of input audio file; close any open output file and exit */
456 if (fp != NULL) {
457 fclose(fp);
458 fp = NULL;
459
460 printf
461 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
462 uttid, (double) starttime / (double) sps,
463 (double) (starttime + uttlen) / (double) sps,
464 (double) uttlen / (double) sps, uttlen);
465 fflush(stdout);
466
467 total_speech_samples += uttlen;
468 total_speech_sec += (double) uttlen / (double) sps;
469
470 uttid++;
471 }
472
473 break;
474 }
475
476 if (cont->state == CONT_AD_STATE_SIL) { /* Silence data got */
477 if (fp != NULL) { /* Currently in an utterance */
478 if (cont->seglen > siltime) { /* Long enough silence detected; end the utterance */
479 fclose(fp);
480 fp = NULL;
481
482 printf
483 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
484 uttid, (double) starttime / (double) sps,
485 (double) (starttime + uttlen) / (double) sps,
486 (double) uttlen / (double) sps, uttlen);
487 fflush(stdout);
488
489 total_speech_samples += uttlen;
490 total_speech_sec += (double) uttlen / (double) sps;
491
492 uttid++;
493 }
494 else {
495 /*
496 * Short silence within utt; write it to output. (Some extra trailing silence
497 * is included in the utterance, as a result. Not to worry about it.)
498 */
499 if (k > 0) {
500 fwrite(buf, sizeof(int16), k, fp);
501 uttlen += k;
502 }
503 }
504 }
505 }
506 else {
507 assert(cont->state == CONT_AD_STATE_SPEECH);
508
509 if (fp == NULL) { /* Not in an utt; open a new output file */
510 if (writeseg)
511 sprintf(segfile, "%08d.raw", uttid);
512 else
513 strcpy(segfile, NULL_DEVICE);
514 if ((fp = fopen(segfile, "wb")) == NULL)
515 E_FATAL_SYSTEM("Failed to open segmentation file '%s' for writing", segfile);
516
517 starttime = cont->read_ts - k;
518 uttlen = 0;
519 }
520
521 /* Write data obtained to output file */
522 if (k > 0) {
523 fwrite(buf, sizeof(int16), k, fp);
524 uttlen += k;
525 }
526 }
527 }
528
529 if (rawfp)
530 fclose(rawfp);
531
532 E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
533 cont->tot_frm, cont->tot_frm * cont->spf,
534 (cont->tot_frm * cont->spf) / (float32) cont->sps);
535 E_INFO("Total speech detected = %d samples, %.2f sec\n",
536 total_speech_samples, total_speech_sec);
537
538 cont_ad_close(cont);
539
540 return 0;
541}
generic live audio interface for recording and playback
Continuous A/D listening and silence filtering module.
SPHINXBASE_EXPORT int32 cont_ad_set_logfp(cont_ad_t *c, FILE *fp)
Set the file to which cont_ad logs its progress.
SPHINXBASE_EXPORT int32 cont_ad_close(cont_ad_t *cont)
Close the continuous listening object.
SPHINXBASE_EXPORT int32 cont_ad_set_rawfp(cont_ad_t *c, FILE *fp)
Set a file for dumping raw audio input.
SPHINXBASE_EXPORT int32 cont_ad_calib(cont_ad_t *cont)
Calibrate the silence filter.
SPHINXBASE_EXPORT int32 cont_ad_set_params(cont_ad_t *r, int32 delta_sil, int32 delta_speech, int32 min_noise, int32 max_noise, int32 winsize, int32 speech_onset, int32 sil_onset, int32 leader, int32 trailer, float32 adapt_rate)
Set the changeable parameters.
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init_rawmode(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initializes a continuous listening object which simply passes data through (!)
SPHINXBASE_EXPORT int32 cont_ad_read(cont_ad_t *r, int16 *buf, int32 max)
Read raw audio data into the silence filter.
SPHINXBASE_EXPORT int32 cont_ad_get_params(cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, int32 *min_noise, int32 *max_noise, int32 *winsize, int32 *speech_onset, int32 *sil_onset, int32 *leader, int32 *trailer, float32 *adapt_rate)
PWP 1/14/98 – get the changeable params.
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initialize a continuous listening/silence filtering object.
Implementation of logging routines.
#define E_FATAL_SYSTEM
Print error text; Call perror(""); exit(errno);.
Definition err.h:132
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
#define E_INFO
Print logging information to standard error stream.
Definition err.h:147
#define E_INFOCONT
Print logging information without header, to standard error stream.
Definition err.h:153
Basic type definitions used in Sphinx.
Definition ad.h:255
int32 sps
Samples/sec.
Definition ad.h:256
int32 bps
Bytes/sample.
Definition ad.h:257
Continuous listening module or object.
Definition cont_ad.h:151
int32 sps
Samples/sec; moved from ad->sps to break dependence on ad by N.
Definition cont_ad.h:180
int32 seglen
Total no. of raw samples consumed in the segment returned by the most recent cont_ad_read call.
Definition cont_ad.h:171
int32 tot_frm
Total number of frames of A/D data read, including consumed ones.
Definition cont_ad.h:191
int32 spf
Samples/frame; audio level is analyzed within frames.
Definition cont_ad.h:185
int32 state
State of data returned by most recent cont_ad_read call; CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition cont_ad.h:165
int32 read_ts
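Absolute timestamp (total no. of raw samples consumed up to the most recent cont_ad_read call, starting from the very beginning).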
Definition cont_ad.h:167