SphinxBase 0.6
cont_ad_base.c
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37/*
38 * cont_ad.c -- Continuous A/D listening and silence filtering module.
39 *
40 * HISTORY
41 *
42 * $Log: cont_ad_base.c,v $
43 * Revision 1.14 2005/07/02 03:51:32 rkm
44 * Slowed down power histogram decay rate
45 *
46 * Revision 1.13 2005/06/30 00:27:17 rkm
47 * Fixed silence handling in rawmode; added extra state variables
48 *
49 *
50 * 28-Jun-2005 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University.
51 * - Changed rawmode handling to simply copy data even for silence
52 * segments.
53 * - Moved definitions of CONT_AD_STATE_{SIL,SPEECH} from .c to .h.
54 *
55 * Revision 1.12 2005/06/29 23:48:04 egouvea
56 * Revert changes: variables defined in cont_ad_base.c should not be accessible by the application
57 *
58 * Revision 1.10 2005/02/13 01:29:48 rkm
59 * Fixed cont_ad_read to never cross sil/speech boundary, and rawmode
60 *
61 * Revision 1.9 2005/02/01 22:21:19 rkm
62 * Added raw data logging, and raw data pass-through mode to cont_ad
63 *
64 * Revision 1.8 2004/07/23 23:36:34 egouvea
65 * Ravi's merge, with the latest fixes in the FSG code, and making the log files generated by FSG, LM, and allphone have the same 'look and feel', with the backtrace information presented consistently
66 *
67 * 23-Jul-2004 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
68 * Changed default adapt_rate from 0.5 to 0.2.
69 *
70 * Revision 1.7 2004/07/16 00:57:12 egouvea
71 * Added Ravi's implementation of FSG support.
72 *
73 * Revision 1.2 2004/06/23 20:31:18 rkm
74 * Added adapt_rate parameter; restructured frame processing to include threshold update
75 *
76 *
77 * 23-Oct-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
78 * Small change in the way the noiselevel is updated in find_thresh().
79 *
80 * 26-Aug-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
81 * Separated computation of "frame power" into a separate low-level
82 * function.
83 *
84 * 13-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
85 * Modified to allow frame size to depend on audio sampling rate.
86 *
87 * 01-Jul-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
88 * Changed CONT_AD_DELTA_SPEECH back to 20.
89 *
90 * 30-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
91 * Changed CONT_AD_DELTA_SPEECH from 10 to 15.
92 * Added FILE* argument to cont_ad_powhist_dump().
93 *
94 * 19-Jun-98 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
95 * Changed CONT_AD_DELTA_SPEECH from 20 to 10, to increase sensitivity
96 * to very short utterances.
97 *
98 * 16-Jan-98 Paul Placeway (pwp@cs.cmu.edu) at Carnegie Mellon University
99 * Changed to use dB instead of the weird power measure.
100 * Changed analysis window size, tuned default settings of most
101 * parameters to make the system less sensitive to noise, changed
102 * the histogram update frequency and decay to make the system
103 * adapt more rapidly to changes in the environment.
104 * Added cont_ad_set_params() and cont_ad_get_params().
105 *
106 * 28-Jul-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
107 * Added FRMPOW2SIGLVL, max_siglvl(), and cont_ad_t.siglvl.
108 * Changed min signal energy/frame to CONT_AD_SPF.
109 *
110 * 27-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
111 * Added the option for cont_ad_read to return -1 on EOF.
112 *
113 * 21-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
114 * Added cont_ad_set_thresh().
115 * Bugfix: n_other is recomputed after updating thresholds.
116 *
117 * 20-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
118 * Separated thresholds for speech and silence.
119 * Fixed bug in moving analysis window upon transition to speech state.
120 *
121 * 17-Jun-96 M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon University
122 * Created, based loosely on Steve Reed's original implementation.
123 */
124
125/*
126 * This module is intended to be interposed as a filter between any raw A/D source and the
127 * application to remove silence regions. It is initialized with a raw A/D source function
128 * (during the cont_ad_init call). Filtered A/D data can be read by the application using
129 * the cont_ad_read function. This module assumes that the A/D source function supplies an
130 * endless stream of data. The application is responsible for setting up the A/D source,
131 * turning recording on and off as it desires. It is also responsible for invoking the
132 * cont_ad_read function frequently enough to avoid buffer overruns and dropping A/D data.
133 * This continuous listening module has an internal buffer of about 4 sec.
134 *
135 * This module must be initialized and calibrated at first (cont_ad_init and cont_ad_calib
136 * functions). Raw samples are grouped into frames, the signal power in each frame is
137 * computed and accumulated in a histogram. The module is always in one of two states:
138 * SILENCE or SPEECH. Transitions between the two states are detected by looking for a
139 * contiguous window of several frames that is predominantly of the other type. The type
140 * is determined by comparing frame power to either of two thresholds, thresh_sil and
141 * thresh_speech, as appropriate for the current state. These thresholds are set from the
142 * first peak in the low-end of the power histogram, and are updated every few seconds.
143 * Separate thresholds are used to provide some hysteresis.
144 *
145 * The module maintains a linked list of speech (non-silence) segments not yet read by the
146 * application. The cont_ad_read function returns speech data, if any available, by
147 * following this list. It also updates an "absolute" timestamp at the end of the
148 * cont_ad_read operation. The timestamp indicates the total #samples of A/D data read
149 * until this point, including data discarded as silence frames. The application is
150 * responsible for using this timestamp to make any policy decisions regarding utterance
151 * boundaries or whatever.
152 */
153
154#include <stdio.h>
155#include <stdlib.h>
156#include <string.h>
157#include <assert.h>
158#include <math.h>
159
160#ifdef HAVE_CONFIG_H
161#include <config.h>
162#endif
163
164#ifdef _MSC_VER
165#pragma warning (disable: 4305)
166#endif
167
168#include "sphinxbase/prim_type.h"
169#include "sphinxbase/ad.h"
170#include "sphinxbase/cont_ad.h"
171#include "sphinxbase/err.h"
172
173
174#ifndef _ABS
175#define _ABS(x) ((x) >= 0 ? (x) : -(x))
176#endif
177
178
179/* Various parameters, including defaults for many cont_ad_t member variables */
180
181#define CONT_AD_ADFRMSIZE 256 /* #Frames of internal A/D buffer maintained */
182
183#define CONT_AD_POWHISTSIZE 98 /* #Powhist bins: ~ FRMPOW(65536^2*CONT_AD_SPF) */
184/* Maximum level is 96.3 dB full-scale; 97 for safety, plus 1 for zero-based */
185
186#define CONT_AD_CALIB_FRAMES (CONT_AD_POWHISTSIZE * 2)
187
188#define CONT_AD_THRESH_UPDATE 100 /* Update thresholds approx every so many frames */
189 /* PWP: update was 200 frames, or 3.2 seconds. Now about every 1.6 sec. */
190
191#define CONT_AD_ADAPT_RATE 0.2 /* Interpolation of new and old noiselevel */
192
193#define CONT_AD_SPS 16000
194
195#define CONT_AD_DEFAULT_NOISE 30 /* Default background noise power level */
196#define CONT_AD_DELTA_SIL 10 /* Initial default for cont_ad_t.delta_sil */
197#define CONT_AD_DELTA_SPEECH 17 /* Initial default for cont_ad_t.delta_speech */
198#define CONT_AD_MIN_NOISE 2 /* Expected minimum background noise level */
199#define CONT_AD_MAX_NOISE 70 /* Maximum background noise level */
200
201#define CONT_AD_HIST_INERTIA 3 /* Used in decaying the power histogram */
202
203#define CONT_AD_WINSIZE 21 /* Analysis window for state transitions */
204 /* rkm had 16 */
205
206#define CONT_AD_SPEECH_ONSET 9 /* Min #speech frames in analysis window for
207 SILENCE -> SPEECH state transition */
208/*
209 * SReed had 100 ms == 6.25 fr contiguous; rkm had 9 (out of 16+10) with a
210 * lower threshold.
211 */
212
213#define CONT_AD_SIL_ONSET 18 /* Min #silence frames in analysis window for
214 SPEECH -> SILENCE state transition
215 MUST BE <= CONT_AD_WINSIZE */
216/*
217 * SReed had 400 ms == 25 fr contiguous; rkm had 14 out of 16
218 */
219
220#define CONT_AD_LEADER 5 /* On transition to SPEECH state, so many frames
221 BEFORE window included in speech data (>0) */
222 /* SReed had 200 ms == 12.5 fr; rkm had 5 */
223
224#define CONT_AD_TRAILER 10 /* On transition to SILENCE state, so many frames
225 of silence included in speech data (>0).
226 NOTE: Ensure (0 < TRAILER+LEADER <= WINSIZE) */
227 /* SReed had 100 ms == 6.25 fr; rkm had 10 */
228
229
230void
231cont_ad_powhist_dump(FILE * fp, cont_ad_t * r)
232{
233 int32 i, j;
234
235 fprintf(fp, "PowHist:\n");
236 for (i = 0, j = 0; i < CONT_AD_POWHISTSIZE; i++) {
237 if (r->pow_hist[i] > 0) {
238 fprintf(fp, "\t%3d %6d\n", i, r->pow_hist[i]);
239 j = i;
240 }
241 }
242
243 fprintf(fp, "PH[%7.2f]:",
244 (double) (r->tot_frm * r->spf) / (double) (r->sps));
245 for (i = 0; i <= j; i++)
246 fprintf(fp, " %2d", r->pow_hist[i]);
247 fprintf(fp, "\n");
248
249 fflush(fp);
250}
251
252
253/*
254 * Compute frame power. Interface deliberately kept low level to allow arbitrary
255 * users to call this function with appropriate data.
256 */
257int32
258cont_ad_frame_pow(int16 * buf, int32 * prev, int32 spf)
259{
260 double sumsq, v;
261 int32 i;
262 int32 p;
263
264 sumsq = 0.0;
265 p = *prev;
266 for (i = 0; i < spf; i++) {
267 /* Note: pre-emphasis done to remove low-frequency noise. */
268 v = (double) (buf[i] - p);
269 sumsq += v * v;
270 p = buf[i];
271 }
272 *prev = p;
273
274 if (sumsq < spf) /* Make sure FRMPOW(sumsq) >= 0 */
275 sumsq = spf;
276
277 /*
278 * PWP: Units changed to dB
279 *
280 * Now the units of measurement of an input sample are volts (really!),
281 * so the power in dB is p = 20*log10(samp). Further, we want the RMS
282 * (root-mean-squared) average power across the frame.
283 *
284 * "sumsq" is the sum of the sum of the squares, so we want
285 *
286 * i = 20 * log10( sqrt ( sumsq / n_samps) )
287 *
288 * (Stephen Reed's code actually had
289 * i = 20 * log10( sqrt (sumsq) / n_samps )
290 * but this only produced an additive error.)
291 *
292 * i = 20 * log10( sqrt ( sumsq / n_samps) )
293 * = 20 * log10( ( sumsq / n_samps) ^ 0.5 )
294 * = 20 * log10( ( sumsq / n_samps) ) * 0.5 )
295 * = 10 * log10( ( sumsq / n_samps) )
296 * = 10 * ( log10( sumsq) - log10(n_samps) )
297 */
298 i = (int32) ((10.0 * (log10(sumsq) - log10((double) spf))) + 0.5);
299 if (i < 0)
300 i = 0; /* trim lower bound again to be safe. */
301 assert(i < 97);
302
303 return (i);
304}
305
306
307/*
308 * Classify frame (id=frm, starting at sample position s) as sil/nonsil. Classification
309 * done in isolation, independent of any other frame, based only on power histogram.
310 */
311static void
312compute_frame_pow(cont_ad_t * r, int32 frm)
313{
314 int32 i;
315
316 i = cont_ad_frame_pow(r->adbuf + (frm * r->spf), &(r->prev_sample),
317 r->spf);
318
319 r->frm_pow[frm] = (char) i;
320 (r->pow_hist[i])++;
321 r->thresh_update--;
322}
323
324
325/* PWP: $$$ check this */
326/*
327 * PWP: in SReed's code, decay was done by zeroing the histogram,
328 * i.e. no history.
329 */
330static void
331decay_hist(cont_ad_t * r)
332{
333 int32 i;
334
335 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
336 r->pow_hist[i] -= (r->pow_hist[i] >> CONT_AD_HIST_INERTIA);
337}
338
339
340/*
341 * Find silence threshold from power histogram.
342 */
343static int32
344find_thresh(cont_ad_t * r)
345{
346 int32 i, j, max, th;
347 int32 old_noise_level, old_thresh_sil, old_thresh_speech;
348
349 if (!r->auto_thresh)
350 return 0;
351
352 /*
353 * Find smallest non-zero histogram entry, but starting at some minimum power.
354 * Power lower than CONT_AD_MIN_NOISE indicates bad A/D input (eg, mic off...).
355 * Too high a minimum power is also bad.
356 */
357 for (i = r->min_noise;
358 (i < CONT_AD_POWHISTSIZE) && (r->pow_hist[i] == 0); i++);
359 if (i > r->max_noise) /* Bad signal? */
360 return -1;
361
362 /* PWP: Hmmmmm.... SReed's code looks over the lower 20 dB */
363 /* PWP: 1/14/98 Made to work like Stephen Reed's code */
364
365 /* This method of detecting the noise level is VERY unsatisfactory */
366 max = 0;
367 for (j = i, th = i; (j < CONT_AD_POWHISTSIZE) && (j < i + 20); j++) { /* PWP: was i+6, which was 9 dB */
368 if (max < r->pow_hist[j]) {
369 max = r->pow_hist[j];
370 th = j;
371 }
372 }
373
374 /* "Don't change the threshold too fast" */
375 old_noise_level = r->noise_level;
376 old_thresh_sil = r->thresh_sil;
377 old_thresh_speech = r->thresh_speech;
378 /* r->noise_level = (int32) (th * r->adapt_rate + r->noise_level * (1.0 - r->adapt_rate)); */
379 r->noise_level =
380 (int32) (r->noise_level +
381 r->adapt_rate * (th - r->noise_level) + 0.5);
382
383 /* update thresholds */
384 r->thresh_sil = r->noise_level + r->delta_sil;
386
387 if (r->logfp) {
388 fprintf(r->logfp,
389 "%7.2fs %8df: NoisePeak: %d, Noiselevel: %d -> %d, Th-Sil: %d -> %d, Th-Sp: %d -> %d\n",
390 (double) (r->tot_frm * r->spf) / (double) (r->sps),
391 r->tot_frm, th, old_noise_level, r->noise_level,
392 old_thresh_sil, r->thresh_sil, old_thresh_speech,
393 r->thresh_speech);
394
396
397 fflush(r->logfp);
398 }
399
400 /*
401 * PWP: in SReed's original, he cleared the histogram here.
402 * I can't fathom why.
403 */
404
405 return 0;
406}
407
408
409/*
410 * Silence to speech transition
411 */
412static void
413sil2speech_transition(cont_ad_t *r, int frm)
414{
415 spseg_t *seg;
416
417 /* Speech detected; create speech segment description */
418 seg = malloc(sizeof(*seg));
419
420 seg->startfrm = r->win_startfrm - r->leader;
421 if (seg->startfrm < 0)
422 seg->startfrm += CONT_AD_ADFRMSIZE;
423 seg->nfrm = r->leader + r->winsize;
424 seg->next = NULL;
425
426 if (!r->spseg_head)
427 r->spseg_head = seg;
428 else
429 r->spseg_tail->next = seg;
430 r->spseg_tail = seg;
431
432 r->tail_state = CONT_AD_STATE_SPEECH;
433
434 if (r->logfp) {
435 int32 n;
436
437 /* Where (in absolute time) this speech segment starts */
438 n = frm - seg->startfrm;
439 if (n < 0)
440 n += CONT_AD_ADFRMSIZE;
441 n = r->tot_frm - n - 1;
442
443 fprintf(r->logfp,
444 "%7.2fs %8d[%3d]f: Sil -> Sp detect; seg start: %7.2fs %8d\n",
445 (double) (r->tot_frm *
446 r->spf) /
447 (double) (r->sps),
448 r->tot_frm, frm,
449 (double) (n * r->spf) / (double) (r->sps), n);
450 }
451
452 /* Now in SPEECH state; want to look for silence from end of this window */
453 r->win_validfrm = 1;
454 r->win_startfrm = frm;
455
456 /* Count #sil frames remaining in reduced window (of 1 frame) */
457 r->n_other = (r->frm_pow[frm] <= r->thresh_sil) ? 1 : 0;
458}
459
460/*
461 * Speech to silence transition
462 */
463static void
464speech2sil_transition(cont_ad_t *r, int frm)
465{
466 int f;
467
468 /* End of speech detected; speech->sil transition */
469 r->spseg_tail->nfrm += r->trailer;
470
471 r->tail_state = CONT_AD_STATE_SIL;
472
473 if (r->logfp) {
474 int32 n;
475
476 /* Where (in absolute time) this speech segment ends */
477 n = r->spseg_tail->startfrm + r->spseg_tail->nfrm - 1;
478 if (n >= CONT_AD_ADFRMSIZE)
479 n -= CONT_AD_ADFRMSIZE;
480 n = frm - n;
481 if (n < 0)
482 n += CONT_AD_ADFRMSIZE;
483 n = r->tot_frm - n;
484
485 fprintf(r->logfp,
486 "%7.2fs %8d[%3d]f: Sp -> Sil detect; seg end: %7.2fs %8d\n",
487 (double) (r->tot_frm * r->spf) /
488 (double) (r->sps), r->tot_frm, frm,
489 (double) (n * r->spf) / (double) (r->sps), n);
490 }
491
492 /* Now in SILENCE state; start looking for speech trailer+leader frames later */
493 r->win_validfrm -= (r->trailer + r->leader - 1);
494 r->win_startfrm += (r->trailer + r->leader - 1);
495 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
496 r->win_startfrm -= CONT_AD_ADFRMSIZE;
497
498 /* Count #speech frames remaining in reduced window */
499 r->n_other = 0;
500 for (f = r->win_startfrm;;) {
501 if (r->frm_pow[f] >= r->thresh_speech)
502 r->n_other++;
503
504 if (f == frm)
505 break;
506
507 f++;
508 if (f >= CONT_AD_ADFRMSIZE)
509 f = 0;
510 }
511}
512
513
514/*
515 * Main silence/speech region detection routine. If currently in
516 * SILENCE state, switch to SPEECH state if a window (r->winsize)
517 * of frames is mostly non-silence. If in SPEECH state, switch to
518 * SILENCE state if the window is mostly silence.
519 */
520static void
521boundary_detect(cont_ad_t * r, int32 frm)
522{
523 assert(r->n_other >= 0);
524
525 r->win_validfrm++;
526 if (r->tail_state == CONT_AD_STATE_SIL) {
527 if (r->frm_pow[frm] >= r->thresh_speech)
528 r->n_other++;
529 }
530 else {
531 if (r->frm_pow[frm] <= r->thresh_sil)
532 r->n_other++;
533 }
534
535 if (r->logfp) {
536 fprintf(r->logfp,
537 "%7.2fs %8d[%3d]f: P: %2d, N: %2d, T+: %2d, T-: %2d, #O: %2d, %s\n",
538 (double) (r->tot_frm * r->spf) / (double) (r->sps),
539 r->tot_frm, frm, r->frm_pow[frm], r->noise_level,
541 (r->tail_state == CONT_AD_STATE_SIL) ? "--" : "Sp");
542 }
543
544 if (r->win_validfrm < r->winsize) /* Not reached full analysis window size */
545 return;
546 assert(r->win_validfrm == r->winsize);
547
548 if (r->tail_state == CONT_AD_STATE_SIL) { /* Currently in SILENCE state */
549 if (r->n_frm >= r->winsize + r->leader
550 && r->n_other >= r->speech_onset) {
551 sil2speech_transition(r, frm);
552 }
553 }
554 else {
555 if (r->n_other >= r->sil_onset) {
556 speech2sil_transition(r, frm);
557 }
558 else {
559 /* In speech state, and staying there; add this frame to segment */
560 r->spseg_tail->nfrm++;
561 }
562 }
563
564 /*
565 * Get rid of oldest frame in analysis window. Not quite correct;
566 * thresholds could have changed over the window; should preserve
567 * the original speech/silence label for the frame and undo it. Later..
568 */
569 if (r->tail_state == CONT_AD_STATE_SIL) {
570 if (r->frm_pow[r->win_startfrm] >= r->thresh_speech) {
571 if (r->n_other > 0)
572 r->n_other--;
573 }
574 }
575 else {
576 if (r->frm_pow[r->win_startfrm] <= r->thresh_sil) {
577 if (r->n_other > 0)
578 r->n_other--;
579 }
580 }
581 r->win_validfrm--;
582 r->win_startfrm++;
583 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
584 r->win_startfrm = 0;
585
586 if (r->logfp)
587 fflush(r->logfp);
588}
589
590
591static int32
592max_siglvl(cont_ad_t * r, int32 startfrm, int32 nfrm)
593{
594 int32 siglvl, i, f;
595
596 siglvl = 0;
597 if (nfrm > 0) {
598 for (i = 0, f = startfrm; i < nfrm; i++, f++) {
599 if (f >= CONT_AD_ADFRMSIZE)
600 f -= CONT_AD_ADFRMSIZE;
601 if (r->frm_pow[f] > siglvl)
602 siglvl = r->frm_pow[f];
603 }
604 }
605 return siglvl;
606}
607
608
609#if 0
610/*
611 * RKM(2005/01/31): Where did this come from? If needed, it should be called
612 * cont_ad_get_audio_data.
613 */
614void
615get_audio_data(cont_ad_t * r, int16 * buf, int32 max)
616{
617}
618#endif
619
620
621static void
622cont_ad_read_log(cont_ad_t * r, int32 retval)
623{
624 spseg_t *seg;
625
626 fprintf(r->logfp, "return from cont_ad_read() -> %d:\n", retval);
627 fprintf(r->logfp, "\tstate: %d\n", r->state);
628 fprintf(r->logfp, "\tread_ts: %d (%.2fs)\n",
629 r->read_ts, (float32) r->read_ts / (float32) r->sps);
630 fprintf(r->logfp, "\tseglen: %d (%.2fs)\n",
631 r->seglen, (float32) r->seglen / (float32) r->sps);
632 fprintf(r->logfp, "\tsiglvl: %d\n", r->siglvl);
633 fprintf(r->logfp, "\theadfrm: %d\n", r->headfrm);
634 fprintf(r->logfp, "\tn_frm: %d\n", r->n_frm);
635 fprintf(r->logfp, "\tn_sample: %d\n", r->n_sample);
636 fprintf(r->logfp, "\twin_startfrm: %d\n", r->win_startfrm);
637 fprintf(r->logfp, "\twin_validfrm: %d\n", r->win_validfrm);
638 fprintf(r->logfp, "\tnoise_level: %d\n", r->noise_level);
639 fprintf(r->logfp, "\tthresh_sil: %d\n", r->thresh_sil);
640 fprintf(r->logfp, "\tthresh_speech: %d\n", r->thresh_speech);
641 fprintf(r->logfp, "\tn_other: %d\n", r->n_other);
642 fprintf(r->logfp, "\ttail_state: %d\n", r->tail_state);
643 fprintf(r->logfp, "\ttot_frm: %d\n", r->tot_frm);
644
645 fprintf(r->logfp, "\tspseg:");
646 for (seg = r->spseg_head; seg; seg = seg->next)
647 fprintf(r->logfp, " %d[%d]", seg->startfrm, seg->nfrm);
648 fprintf(r->logfp, "\n");
649
650 fflush(r->logfp);
651}
652
653
654/*
655 * Copy data from r->adbuf[sf], for nf frames, into buf.
656 * All length checks must have been completed before this call; hence, this
657 * function will copy exactly the specified number of frames.
658 *
659 * Return value: Index of frame just after the segment copied, possibly wrapped
660 * around to 0.
661 */
662static int32
663buf_copy(cont_ad_t * r, int32 sf, int32 nf, int16 * buf)
664{
665 int32 f, l;
666
667 assert((sf >= 0) && (sf < CONT_AD_ADFRMSIZE));
668 assert(nf >= 0);
669
670 if (sf + nf > CONT_AD_ADFRMSIZE) {
671 /* Amount to be copied wraps around adbuf; copy in two stages */
672 f = CONT_AD_ADFRMSIZE - sf;
673 l = (f * r->spf);
674 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
675
676 if (r->logfp) {
677 fprintf(r->logfp,
678 "return %d speech frames [%d..%d]; %d samples\n",
679 f, sf, sf + f - 1, l);
680 }
681
682 buf += l;
683 sf = 0;
684 nf -= f;
685 }
686
687 if (nf > 0) {
688 l = (nf * r->spf);
689 memcpy(buf, r->adbuf + (sf * r->spf), l * sizeof(int16));
690
691 if (r->logfp) {
692 fprintf(r->logfp,
693 "return %d speech frames [%d..%d]; %d samples\n",
694 nf, sf, sf + nf - 1, l);
695 }
696 }
697
698 if ((sf + nf) >= CONT_AD_ADFRMSIZE) {
699 assert((sf + nf) == CONT_AD_ADFRMSIZE);
700 return 0;
702 else
703 return (sf + nf);
704}
705
706int32
708{
709 return r->adbufsize - r->n_sample;
710}
711
712/*
713 * Read as much data as possible from r->adfunc into r->adbuf.
714 */
715static int32
716cont_ad_read_internal(cont_ad_t *r, int16 *buf, int32 max)
717{
718 int32 head, tail, len, l;
719
720 /*
721 * First read as much of raw A/D as possible and available. adbuf is not
722 * really a circular buffer, so may have to read in two steps for wrapping
723 * around.
724 */
725 head = r->headfrm * r->spf;
726 tail = head + r->n_sample;
727 len = r->n_sample - (r->n_frm * r->spf); /* #partial frame samples at the tail */
728 assert((len >= 0) && (len < r->spf));
729
730 if ((tail < r->adbufsize) && (!r->eof)) {
731 if (r->adfunc) {
732 if ((l =
733 (*(r->adfunc)) (r->ad, r->adbuf + tail,
734 r->adbufsize - tail)) < 0) {
735 r->eof = 1;
736 l = 0;
737 }
738 }
739 else {
740 l = r->adbufsize - tail;
741 if (l > max) {
742 l = max;
743 max = 0;
744 }
745 else {
746 max -= l;
747 }
748 memcpy(r->adbuf + tail, buf, l * sizeof(int16));
749 buf += l;
750 }
751 if ((l > 0) && r->rawfp) {
752 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
753 fflush(r->rawfp);
754 }
755
756 tail += l;
757 len += l;
758 r->n_sample += l;
759 }
760 if ((tail >= r->adbufsize) && (!r->eof)) {
761 tail -= r->adbufsize;
762 if (tail < head) {
763 if (r->adfunc) {
764 if ((l =
765 (*(r->adfunc)) (r->ad,
766 r->adbuf + tail, head - tail)) < 0) {
767 r->eof = 1;
768 l = 0;
769 }
770 }
771 else {
772 l = head - tail;
773 if (l > max)
774 l = max;
775 memcpy(r->adbuf + tail, buf, l * sizeof(int16));
776 }
777 if ((l > 0) && r->rawfp) {
778 fwrite(r->adbuf + tail, sizeof(int16), l, r->rawfp);
779 fflush(r->rawfp);
780 }
781
782 tail += l;
783 len += l;
784 r->n_sample += l;
785 }
786 }
787
788 return len;
789}
790
791/*
792 * Classify incoming frames as silence or speech.
793 */
794int32
795cont_ad_classify(cont_ad_t *r, int32 len)
796{
797 int32 tailfrm;
798
799 tailfrm = (r->headfrm + r->n_frm); /* Next free frame slot to be filled */
800 if (tailfrm >= CONT_AD_ADFRMSIZE)
801 tailfrm -= CONT_AD_ADFRMSIZE;
802
803 for (; len >= r->spf; len -= r->spf) {
804 compute_frame_pow(r, tailfrm);
805 r->n_frm++;
806 r->tot_frm++;
807
808 /*
809 * Find speech/sil state change, if any. Also, if staying in speech state
810 * add this frame to current speech segment.
811 */
812 boundary_detect(r, tailfrm);
813
814 if (++tailfrm >= CONT_AD_ADFRMSIZE)
815 tailfrm = 0;
816
817 /* Update thresholds if time to do so */
818 if (r->thresh_update <= 0) {
819 int32 i, f;
820 find_thresh(r);
821 decay_hist(r);
822 r->thresh_update = CONT_AD_THRESH_UPDATE;
823
824#if 1
825 /*
826 * Since threshold has been updated, recompute r->n_other.
827 * (RKM: Is this really necessary? Comment out??)
828 */
829 r->n_other = 0;
830 if (r->tail_state == CONT_AD_STATE_SIL) {
831 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
832 if (r->frm_pow[f] >= r->thresh_speech)
833 r->n_other++;
834
835 f++;
836 if (f >= CONT_AD_ADFRMSIZE)
837 f = 0;
838 }
839 }
840 else {
841 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
842 if (r->frm_pow[f] <= r->thresh_sil)
843 r->n_other++;
844
845 f++;
846 if (f >= CONT_AD_ADFRMSIZE)
847 f = 0;
848 }
849 }
850#endif
851 }
852 }
853
854 return r->tail_state;
855}
856
858 * Main function called by the application to filter out silence regions.
859 * Maintains a linked list of speech segments pointing into r->adbuf and feeds
860 * data to application from them.
861 */
862int32
863cont_ad_read(cont_ad_t * r, int16 * buf, int32 max)
864{
865 int32 flen, len, retval, newstate;
866 spseg_t *seg;
867
868 if ((r == NULL) || (buf == NULL))
869 return -1;
870
871 if (max < r->spf) {
872 E_ERROR
873 ("cont_ad_read requires buffer of at least %d samples\n",
874 r->spf);
875 return -1;
876 }
877
878 if (r->logfp) {
879 fprintf(r->logfp, "cont_ad_read(,, %d)\n", max);
880 fflush(r->logfp);
881 }
882
883 /* Read data from adfunc or from buf. */
884 len = cont_ad_read_internal(r, buf, max);
885
886 /* Compute frame power for unprocessed+new data and find speech/silence boundaries */
887 cont_ad_classify(r, len);
888
889 /*
890 * If eof on input data source, cleanup the final segment.
891 */
892 if (r->eof) {
893 if (r->tail_state == CONT_AD_STATE_SPEECH) {
894 /*
895 * Still inside a speech segment when input data got over. Absort any
896 * remaining frames into the final speech segment.
897 */
898 assert(r->spseg_tail != NULL);
899
900 /* Absorb frames still in analysis window into final speech seg */
901 assert((r->win_validfrm >= 0)
902 && (r->win_validfrm < r->winsize));
903 r->spseg_tail->nfrm += r->win_validfrm;
904
905 r->tail_state = CONT_AD_STATE_SIL;
906 }
907
908 r->win_startfrm += r->win_validfrm;
909 if (r->win_startfrm >= CONT_AD_ADFRMSIZE)
910 r->win_startfrm -= CONT_AD_ADFRMSIZE;
911 r->win_validfrm = 0;
912 r->n_other = 0;
913 }
914
915 /*
916 * At last ready to copy speech data, if any, into caller's buffer. Raw
917 * speech data is segmented into alternating speech and silence segments.
918 * But any single call to cont_ad_read will never cross a speech/silence
919 * boundary.
920 */
921 seg = r->spseg_head; /* first speech segment available, if any */
922
923 if ((seg == NULL) || (r->headfrm != seg->startfrm)) {
924 /*
925 * Either no speech data available, or inside a silence segment. Find
926 * length of silence segment.
927 */
928 if (seg == NULL) {
929 assert(r->tail_state == CONT_AD_STATE_SIL);
930
931 flen =
932 (r->eof) ? r->n_frm : r->n_frm - (r->winsize +
933 r->leader - 1);
934 if (flen < 0)
935 flen = 0;
936 }
937 else {
938 flen = seg->startfrm - r->headfrm;
939 if (flen < 0)
940 flen += CONT_AD_ADFRMSIZE;
941 }
942
943 if (r->rawmode) {
944 /* Restrict silence segment to user buffer size, integral #frames */
945 int32 f = max / r->spf;
946 if (flen > f)
947 flen = f;
948 }
949
950 newstate = CONT_AD_STATE_SIL;
951 }
952 else {
953 flen = max / r->spf; /* truncate read-size to integral #frames */
954 if (flen > seg->nfrm)
955 flen = seg->nfrm; /* truncate further to this segment size */
956
957 newstate = CONT_AD_STATE_SPEECH;
958 }
959
960 len = flen * r->spf; /* #samples being consumed */
961
962 r->siglvl = max_siglvl(r, r->headfrm, flen);
963
964 if ((newstate == CONT_AD_STATE_SIL) && (!r->rawmode)) {
965 /* Skip silence data */
966 r->headfrm += flen;
967 if (r->headfrm >= CONT_AD_ADFRMSIZE)
968 r->headfrm -= CONT_AD_ADFRMSIZE;
969
970 retval = 0; /* #samples being copied/returned */
971 }
972 else {
973 /* Copy speech/silence(in rawmode) data */
974 r->headfrm = buf_copy(r, r->headfrm, flen, buf);
975
976 retval = len; /* #samples being copied/returned */
977 }
978
979 r->n_frm -= flen;
980 r->n_sample -= len;
981 assert((r->n_frm >= 0) && (r->n_sample >= 0));
982 assert(r->win_validfrm <= r->n_frm);
983
984 if (r->state == newstate)
985 r->seglen += len;
986 else
987 r->seglen = len;
988 r->state = newstate;
989
990 if (newstate == CONT_AD_STATE_SPEECH) {
991 seg->startfrm = r->headfrm;
992 assert(seg->startfrm >= 0);
993 seg->nfrm -= flen;
994
995 /* Free seg if empty and not recording into it */
996 if ((seg->nfrm == 0)
997 && (seg->next || (r->tail_state == CONT_AD_STATE_SIL))) {
998 r->spseg_head = seg->next;
999 if (seg->next == NULL)
1000 r->spseg_tail = NULL;
1001 free(seg);
1002 }
1003 }
1004
1005 /* Update timestamp. Total raw A/D read - those remaining to be consumed */
1006 r->read_ts = (r->tot_frm - r->n_frm) * r->spf;
1007
1008 if (retval == 0)
1009 retval = (r->eof && (r->spseg_head == NULL)) ? -1 : 0;
1010
1011 if (r->logfp)
1012 cont_ad_read_log(r, retval);
1013
1014 return retval;
1015}
1017
1018/*
1019 * Calibrate input channel for silence threshold.
1020 */
1021int32
1023{
1024 int32 i, s, k, len, tailfrm;
1025
1026 if (r == NULL)
1027 return -1;
1028
1029 /* clear histogram */
1030 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1031 r->pow_hist[i] = 0;
1032 tailfrm = r->headfrm + r->n_frm;
1033 if (tailfrm >= CONT_AD_ADFRMSIZE)
1034 tailfrm -= CONT_AD_ADFRMSIZE;
1035 s = (tailfrm * r->spf);
1036
1037 for (r->n_calib_frame = 0;
1038 r->n_calib_frame < CONT_AD_CALIB_FRAMES;
1039 ++r->n_calib_frame) {
1040 len = r->spf;
1041 while (len > 0) {
1042 /*Trouble */
1043 if ((k = (*(r->adfunc)) (r->ad, r->adbuf + s, len)) < 0)
1044 return -1;
1045 len -= k;
1046 s += k;
1047 }
1048 s -= r->spf;
1049
1050 compute_frame_pow(r, tailfrm);
1051 }
1053 r->thresh_update = CONT_AD_THRESH_UPDATE;
1054 return find_thresh(r);
1055}
1056
1057int32
1059{
1060 return r->spf * CONT_AD_CALIB_FRAMES;
1061}
1062
1063int32
1064cont_ad_calib_loop(cont_ad_t * r, int16 * buf, int32 max)
1065{
1066 int32 i, s, len, tailfrm;
1067
1068 if (r->n_calib_frame == CONT_AD_CALIB_FRAMES) {
1069 /* If calibration previously succeeded, then this is a
1070 * recalibration, so start again. */
1071 r->n_calib_frame = 0;
1072 /* clear histogram */
1073 for (i = 0; i < CONT_AD_POWHISTSIZE; i++)
1074 r->pow_hist[i] = 0;
1075 }
1076
1077 tailfrm = r->headfrm + r->n_frm;
1078 if (tailfrm >= CONT_AD_ADFRMSIZE)
1079 tailfrm -= CONT_AD_ADFRMSIZE;
1080 s = (tailfrm * r->spf);
1081
1082 len = r->spf;
1083 for (; r->n_calib_frame < CONT_AD_CALIB_FRAMES;
1084 ++r->n_calib_frame) {
1085 if (max < len)
1086 return 1;
1087 memcpy(r->adbuf + s, buf, len * sizeof(int16));
1088 max -= len;
1089 buf += len;
1090 compute_frame_pow(r, tailfrm);
1091 }
1092
1093 r->thresh_update = CONT_AD_THRESH_UPDATE;
1094 return find_thresh(r);
1095}
1096
1097
1098/* PWP 1/14/98 -- modified for compatibility with old code */
1099int32
1100cont_ad_set_thresh(cont_ad_t * r, int32 sil, int32 speech)
1101{
1102 if (r == NULL)
1103 return -1;
1104
1105 if ((sil < 0) || (speech < 0)) {
1106 fprintf(stderr,
1107 "cont_ad_set_thresh: invalid threshold arguments: %d, %d\n",
1108 sil, speech);
1109 return -1;
1110 }
1111 r->delta_sil = (3 * sil) / 2;
1112 r->delta_speech = (3 * speech) / 2;
1113
1114 return 0;
1115}
1116
1117
1118/*
1119 * PWP 1/14/98 -- set the changable params.
1121 * delta_sil, delta_speech, min_noise, and max_noise are in dB,
1122 * winsize, speech_onset, sil_onset, leader and trailer are in frames of
1123 * 16 ms length (256 samples @ 16kHz sampling).
1124 */
1125int32
1126cont_ad_set_params(cont_ad_t * r, int32 delta_sil,
1127 int32 delta_speech, int32 min_noise,
1128 int32 max_noise, int32 winsize,
1129 int32 speech_onset, int32 sil_onset, int32 leader,
1130 int32 trailer, float32 adapt_rate)
1131{
1132 if ((delta_sil < 0) || (delta_speech < 0) || (min_noise < 0)
1133 || (max_noise < 0)) {
1134 E_ERROR("threshold arguments: "
1135 "%d, %d, %d, %d must all be >=0\n", delta_sil,
1136 delta_speech, min_noise, max_noise);
1137 return -1;
1138 }
1139
1140 if ((speech_onset > winsize) || (speech_onset <= 0)
1141 || (winsize <= 0)) {
1142 E_ERROR
1143 ("speech_onset, %d, must be <= winsize, %d, and both >0\n",
1144 speech_onset, winsize);
1145 return -1;
1146 }
1147
1148 if ((sil_onset > winsize) || (sil_onset <= 0) || (winsize <= 0)) {
1149 E_ERROR
1150 ("sil_onset, %d, must be <= winsize, %d, and both >0\n",
1151 sil_onset, winsize);
1152 return -1;
1153 }
1154
1155 if (((leader + trailer) > winsize) || (leader <= 0)
1156 || (trailer <= 0)) {
1157 E_ERROR
1158 ("leader, %d, plus trailer, %d, must be <= winsize, %d, and both >0\n",
1159 leader, trailer, winsize);
1160 return -1;
1161 }
1162
1163 if ((adapt_rate < 0.0) || (adapt_rate > 1.0)) {
1164 E_ERROR("adapt_rate, %e; must be in range 0..1\n", adapt_rate);
1165 return -1;
1166 }
1167
1168 if (r == NULL)
1169 return -1;
1170
1171 r->delta_sil = delta_sil;
1172 r->delta_speech = delta_speech;
1173 r->min_noise = min_noise;
1174 r->max_noise = max_noise;
1175
1176 r->winsize = winsize;
1177 r->speech_onset = speech_onset;
1178 r->sil_onset = sil_onset;
1179 r->leader = leader;
1180 r->trailer = trailer;
1181
1182 r->adapt_rate = adapt_rate;
1183
1184 if (r->win_validfrm >= r->winsize)
1185 r->win_validfrm = r->winsize - 1;
1186
1187 return 0;
1188}
1189
1190
1191/*
1192 * PWP 1/14/98 -- get the changable params.
1194 * delta_sil, delta_speech, min_noise, and max_noise are in dB,
1195 * winsize, speech_onset, sil_onset, leader and trailer are in frames of
1196 * 16 ms length (256 samples @ 16kHz sampling).
1197 */
1198int32
1199cont_ad_get_params(cont_ad_t * r, int32 * delta_sil,
1200 int32 * delta_speech, int32 * min_noise,
1201 int32 * max_noise, int32 * winsize,
1202 int32 * speech_onset, int32 * sil_onset,
1203 int32 * leader, int32 * trailer, float32 * adapt_rate)
1204{
1205 if (!delta_sil || !delta_speech || !min_noise || !max_noise
1206 || !winsize || !speech_onset || !sil_onset || !leader
1207 || !trailer || !adapt_rate) {
1208 fprintf(stderr, "cont_ad_get_params: some param slots are NULL\n");
1209 return (-1);
1210 }
1211
1212 if (r == NULL)
1213 return -1;
1214
1215 *delta_sil = r->delta_sil;
1216 *delta_speech = r->delta_speech;
1217 *min_noise = r->min_noise;
1218 *max_noise = r->max_noise;
1219
1220 *winsize = r->winsize;
1221 *speech_onset = r->speech_onset;
1222 *sil_onset = r->sil_onset;
1223 *leader = r->leader;
1224 *trailer = r->trailer;
1225
1226 *adapt_rate = r->adapt_rate;
1227
1228 return 0;
1229}
1231
1232/*
1233 * Reset, discarded any accumulated speech.
1234 */
1235int32
1237{
1238 spseg_t *seg;
1239
1240 if (r == NULL)
1241 return -1;
1242
1243 while (r->spseg_head) {
1244 seg = r->spseg_head;
1245 r->spseg_head = seg->next;
1246 free(seg);
1247 }
1248 r->spseg_tail = NULL;
1249
1250 r->headfrm = 0;
1251 r->n_frm = 0;
1252 r->n_sample = 0;
1253 r->win_startfrm = 0;
1254 r->win_validfrm = 0;
1255 r->n_other = 0;
1256
1257 r->tail_state = CONT_AD_STATE_SIL;
1259 return 0;
1260}
1261
1262
1263int32
1265{
1266 if (cont == NULL)
1267 return -1;
1268
1269 cont_ad_reset(cont); /* Frees any remaining speech segments */
1270
1271 free(cont->adbuf);
1272 free(cont->pow_hist);
1273 free(cont->frm_pow);
1274 free(cont);
1276 return 0;
1277}
1278
1279
1280int32
1282{
1283 if (c == NULL)
1284 return -1;
1285
1286 c->ad = NULL;
1287 c->adfunc = NULL;
1288 return 0;
1289}
1290
1291
1292int32
1294 int32(*func) (ad_rec_t *, int16 *, int32))
1295{
1296 if (c == NULL)
1297 return -1;
1298
1299 c->ad = a;
1300 c->adfunc = func;
1301 c->eof = 0;
1303 return 0;
1304}
1305
1306
1307int32
1308cont_set_thresh(cont_ad_t * r, int32 silence, int32 speech)
1309{
1310 int32 i, f;
1311
1312 r->thresh_speech = speech;
1313 r->thresh_sil = silence;
1314
1315 /* Since threshold has been updated, recompute r->n_other */
1316 r->n_other = 0;
1317 if (r->tail_state == CONT_AD_STATE_SIL) {
1318 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
1319 if (r->frm_pow[f] >= r->thresh_speech)
1320 r->n_other++;
1321
1322 f++;
1323 if (f >= CONT_AD_ADFRMSIZE)
1324 f = 0;
1325 }
1326 }
1327 else if (r->tail_state == CONT_AD_STATE_SPEECH) {
1328 for (i = r->win_validfrm, f = r->win_startfrm; i > 0; --i) {
1329 if (r->frm_pow[f] <= r->thresh_sil)
1330 r->n_other++;
1331
1332 f++;
1333 if (f >= CONT_AD_ADFRMSIZE)
1334 f = 0;
1335 }
1336 }
1337
1338 return 0;
1339}
1341
1342/*
1343 * Set the file pointer for dumping the raw input audio stream.
1344 */
1345int32
1346cont_ad_set_rawfp(cont_ad_t * r, FILE * fp)
1347{
1348 if (r == NULL)
1349 return -1;
1350
1351 r->rawfp = fp;
1352 return 0;
1353}
1355
1356/*
1357 * Set the file pointer for logging cont_ad progress.
1358 */
1359int32
1360cont_ad_set_logfp(cont_ad_t * r, FILE * fp)
1361{
1362 if (r == NULL)
1363 return -1;
1364
1365 r->logfp = fp;
1366 return 0;
1367}
1368
1369
1370/*
1371 * One-time initialization.
1372 */
1373cont_ad_t *
1374cont_ad_init(ad_rec_t * a, int32(*func) (ad_rec_t *, int16 *, int32))
1375{
1376 cont_ad_t *r;
1377
1378 if ((r = malloc(sizeof(*r))) == NULL) {
1379 E_ERROR_SYSTEM("allocation of cont_ad_t failed");
1380 return NULL;
1381 }
1382
1383 r->ad = a;
1384 r->adfunc = func;
1385 r->eof = 0;
1386 r->rawmode = 0;
1387
1388 if (a != NULL)
1389 r->sps = a->sps;
1390 else
1391 r->sps = CONT_AD_SPS;
1392
1393 /* Set samples/frame such that when sps=16000, spf=256 */
1394 r->spf = (r->sps * 256) / CONT_AD_SPS;
1395 r->adbufsize = CONT_AD_ADFRMSIZE * r->spf;
1396
1397 if ((r->adbuf = malloc(r->adbufsize * sizeof(*r->adbuf))) == NULL) {
1398 E_ERROR_SYSTEM("allocation of audio buffer failed");
1399 free(r);
1400 return NULL;
1401 }
1402 if ((r->pow_hist =
1403 calloc(CONT_AD_POWHISTSIZE, sizeof(*r->pow_hist))) == NULL) {
1404 E_ERROR_SYSTEM("allocation of power history buffer failed");
1405 free(r->adbuf);
1406 free(r);
1407 return NULL;
1408 }
1409 if ((r->frm_pow =
1410 calloc(CONT_AD_ADFRMSIZE, sizeof(*r->frm_pow))) == NULL) {
1411 E_ERROR_SYSTEM("allocation of frame power buffer failed");
1412 free(r->pow_hist);
1413 free(r->adbuf);
1414 free(r);
1415 return NULL;
1416 }
1417
1418 r->state = CONT_AD_STATE_SIL;
1419 r->read_ts = 0;
1420 r->seglen = 0;
1421 r->siglvl = 0;
1422 r->prev_sample = 0;
1423 r->tot_frm = 0;
1424 r->noise_level = CONT_AD_DEFAULT_NOISE;
1425
1426 r->auto_thresh = 1;
1427 r->delta_sil = CONT_AD_DELTA_SIL;
1428 r->delta_speech = CONT_AD_DELTA_SPEECH;
1429 r->min_noise = CONT_AD_MIN_NOISE;
1430 r->max_noise = CONT_AD_MAX_NOISE;
1431 r->winsize = CONT_AD_WINSIZE;
1432 r->speech_onset = CONT_AD_SPEECH_ONSET;
1433 r->sil_onset = CONT_AD_SIL_ONSET;
1434 r->leader = CONT_AD_LEADER;
1435 r->trailer = CONT_AD_TRAILER;
1436
1437 r->thresh_sil = r->noise_level + r->delta_sil;
1439 r->thresh_update = CONT_AD_THRESH_UPDATE;
1440 r->adapt_rate = CONT_AD_ADAPT_RATE;
1441
1442 r->tail_state = CONT_AD_STATE_SIL;
1443
1444 r->spseg_head = NULL;
1445 r->spseg_tail = NULL;
1446
1447 r->rawfp = NULL;
1448 r->logfp = NULL;
1449
1450 r->n_calib_frame = 0;
1451
1452 cont_ad_reset(r);
1453
1454 return r;
1455}
1456
1457
1458cont_ad_t *
1460 int32(*func) (ad_rec_t *, int16 *, int32))
1461{
1462 cont_ad_t *r;
1463
1464 r = cont_ad_init(a, func);
1465 r->rawmode = 1;
1466
1467 return r;
1468}
generic live audio interface for recording and playback
Continuous A/D listening and silence filtering module.
SPHINXBASE_EXPORT int32 cont_ad_reset(cont_ad_t *cont)
Reset, discarding any accumulated speech segments.
SPHINXBASE_EXPORT int32 cont_ad_set_logfp(cont_ad_t *c, FILE *fp)
Set the file to which cont_ad logs its progress.
SPHINXBASE_EXPORT void cont_ad_powhist_dump(FILE *fp, cont_ad_t *cont)
Dump the power histogram.
SPHINXBASE_EXPORT int32 cont_ad_close(cont_ad_t *cont)
Close the continuous listening object.
SPHINXBASE_EXPORT int32 cont_set_thresh(cont_ad_t *r, int32 silence, int32 speech)
Set the silence and speech thresholds.
SPHINXBASE_EXPORT int32 cont_ad_attach(cont_ad_t *c, ad_rec_t *a, int32(*func)(ad_rec_t *, int16 *, int32))
Attach the continuous listening module to the given audio device/function.
SPHINXBASE_EXPORT int32 cont_ad_set_rawfp(cont_ad_t *c, FILE *fp)
Set a file for dumping raw audio input.
SPHINXBASE_EXPORT int32 cont_ad_buffer_space(cont_ad_t *r)
Get the maximum number of samples which can be passed into cont_ad_read().
SPHINXBASE_EXPORT int32 cont_ad_calib_size(cont_ad_t *r)
Get the number of samples required to calibrate the silence filter.
SPHINXBASE_EXPORT int32 cont_ad_calib(cont_ad_t *cont)
Calibrate the silence filter.
SPHINXBASE_EXPORT int32 cont_ad_calib_loop(cont_ad_t *r, int16 *buf, int32 max)
Calibrate the silence filter without an audio device.
SPHINXBASE_EXPORT int32 cont_ad_detach(cont_ad_t *c)
Detach the given continuous listening module from the associated audio device.
SPHINXBASE_EXPORT int32 cont_ad_set_params(cont_ad_t *r, int32 delta_sil, int32 delta_speech, int32 min_noise, int32 max_noise, int32 winsize, int32 speech_onset, int32 sil_onset, int32 leader, int32 trailer, float32 adapt_rate)
Set the changable parameters.
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init_rawmode(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initializes a continuous listening object which simply passes data through (!)
SPHINXBASE_EXPORT int32 cont_ad_read(cont_ad_t *r, int16 *buf, int32 max)
Read raw audio data into the silence filter.
SPHINXBASE_EXPORT int32 cont_ad_get_params(cont_ad_t *r, int32 *delta_sil, int32 *delta_speech, int32 *min_noise, int32 *max_noise, int32 *winsize, int32 *speech_onset, int32 *sil_onset, int32 *leader, int32 *trailer, float32 *adapt_rate)
PWP 1/14/98 – get the changable params.
SPHINXBASE_EXPORT cont_ad_t * cont_ad_init(ad_rec_t *ad, int32(*adfunc)(ad_rec_t *ad, int16 *buf, int32 max))
Initialize a continuous listening/silence filtering object.
SPHINXBASE_EXPORT int32 cont_ad_set_thresh(cont_ad_t *cont, int32 sil, int32 sp)
Set silence and speech threshold parameters.
Implementation of logging routines.
#define E_ERROR_SYSTEM
Print error text; Call perror("");.
Definition err.h:142
#define E_ERROR
Print error message to standard error stream.
Definition err.h:169
Basic type definitions used in Sphinx.
Definition ad.h:255
int32 sps
Samples/sec.
Definition ad.h:256
Continuous listening module or object.
Definition cont_ad.h:151
int32 * pow_hist
Histogram of frame power, moving window, decayed.
Definition cont_ad.h:194
int32 thresh_speech
Frame considered to be speech if power >= thresh_speech (for transitioning from SILENCE to SPEECH sta...
Definition cont_ad.h:208
int32 leader
pad beginning of speech with this many extra frms
Definition cont_ad.h:205
int32 sps
Samples/sec; moved from ad->sps to break dependence on ad by N.
Definition cont_ad.h:180
int32 win_validfrm
Number of frames currently available from win_startfrm for analysis.
Definition cont_ad.h:221
char * frm_pow
Frame power.
Definition cont_ad.h:195
int32 speech_onset
start speech on >= these many frames out of winsize, of >= delta_speech
Definition cont_ad.h:203
int32 thresh_update
Number of frames before next update to pow_hist/thresholds.
Definition cont_ad.h:212
int32 seglen
Total no.
Definition cont_ad.h:171
int32 tot_frm
Total number of frames of A/D data read, including consumed ones.
Definition cont_ad.h:191
int32 prev_sample
For pre-emphasis filter.
Definition cont_ad.h:187
int32 siglvl
Max signal level for the data consumed by the most recent cont_ad_read call (dB range: 0-99).
Definition cont_ad.h:175
int32 spf
Samples/frame; audio level is analyzed within frames.
Definition cont_ad.h:185
int32 adbufsize
Buffer size (Number of samples)
Definition cont_ad.h:186
int32 state
State of data returned by most recent cont_ad_read call; CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition cont_ad.h:165
int32 n_other
If in SILENCE state, number of frames in analysis window considered to be speech; otherwise number of...
Definition cont_ad.h:222
int16 * adbuf
Circular buffer for maintaining A/D data read until consumed.
Definition cont_ad.h:158
int32 delta_sil
Max silence power/frame ABOVE noise level.
Definition cont_ad.h:198
int32 win_startfrm
Where next analysis window begins.
Definition cont_ad.h:220
int32 rawmode
Pass all input data through, without filtering silence.
Definition cont_ad.h:156
int32 n_sample
Number of samples of unconsumed data in adbuf.
Definition cont_ad.h:190
int32 read_ts
Absolute timestamp (total no.
Definition cont_ad.h:167
int32 winsize
how many frames to look at for speech det
Definition cont_ad.h:202
int32 auto_thresh
Do automatic threshold adjustment or not.
Definition cont_ad.h:197
int32 sil_onset
end speech on >= these many frames out of winsize, of <= delta_sil
Definition cont_ad.h:204
int32 headfrm
Frame number in adbuf with unconsumed A/D data.
Definition cont_ad.h:188
int32 trailer
pad end of speech with this many extra frms
Definition cont_ad.h:206
ad_rec_t * ad
A/D device argument for adfunc.
Definition cont_ad.h:154
int32 n_frm
Number of complete frames of unconsumed A/D data in adbuf.
Definition cont_ad.h:189
spseg_t * spseg_head
First of unconsumed speech segments.
Definition cont_ad.h:224
float32 adapt_rate
Linear interpolation constant for rate at which noise level adapted to each estimate; range: 0-1; 0=>...
Definition cont_ad.h:213
int32 min_noise
noise lower than this we ignore
Definition cont_ad.h:200
int32 n_calib_frame
Number of frames of calibration data seen so far.
Definition cont_ad.h:236
spseg_t * spseg_tail
Last of unconsumed speech segments.
Definition cont_ad.h:225
FILE * logfp
If non-NULL, write detailed logs of this object's progress to the file.
Definition cont_ad.h:231
int32 thresh_sil
Frame considered to be silence if power <= thresh_sil (for transitioning from SPEECH to SILENCE state...
Definition cont_ad.h:210
int32 tail_state
State at the end of its internal buffer (internal use): CONT_AD_STATE_SIL or CONT_AD_STATE_SPEECH.
Definition cont_ad.h:217
int32 max_noise
noise higher than this signals an error
Definition cont_ad.h:201
int32 noise_level
PWP: what we claim as the "current" noise level.
Definition cont_ad.h:192
int32 eof
Whether the source ad device has encountered EOF.
Definition cont_ad.h:183
FILE * rawfp
If non-NULL, raw audio input data processed by cont_ad is dumped to this file.
Definition cont_ad.h:227
int32 delta_speech
Min speech power/frame ABOVE noise level.
Definition cont_ad.h:199
int32 startfrm
Frame-id in adbuf (see below) of start of this segment.
Definition cont_ad.h:136
int32 nfrm
Number of frames in segment (may wrap around adbuf)
Definition cont_ad.h:137
struct spseg_s * next
Next speech segment (with some intervening silence)
Definition cont_ad.h:138
(FOR INTERNAL USE ) Data structure for maintaining speech (non-silence) segments not yet consumed by ...