00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095 #include <stdio.h>
00096 #include <stdlib.h>
00097 #include <string.h>
00098 #include <assert.h>
00099 #include <math.h>
00100
00101 #include <prim_type.h>
00102 #include <ad.h>
00103 #include <cont_ad.h>
00104 #include <err.h>
00105
00106 static FILE *infp;              /* Input audio file: opened by main(), read by file_ad_read(). */
00107 static int32 swap;              /* Nonzero (-b / -byteswap): file_ad_read() swaps the two bytes of every sample. */
00108
00109 /* Upper bound on the number of samples file_ad_read() requests per call (set by -max-adreadsize). */
00110 static int32 max_ad_read_size;
00111 /* File opened for segment output by main() when -w / -writeseg is not given. */
00112 #if defined(WIN32) && !defined(GNUWINCE)
00113 #define NULL_DEVICE "NUL"
00114 #else
00115 #define NULL_DEVICE "/dev/null"
00116 #endif
00117
00118
00119
00120
00121
00122
00123
00124 static int32
00125 file_ad_read(ad_rec_t * r, int16 * buf, int32 max)
00126 {
00127 int32 i, k;
00128
00129 if (max > max_ad_read_size)
00130 max = max_ad_read_size;
00131
00132 k = fread(buf, sizeof(int16), max, infp);
00133 if (swap) {
00134 for (i = 0; i < k; i++) {
00135 buf[i] = ((buf[i] >> 8) & 0x00ff) | ((buf[i] << 8) & 0xff00);
00136 }
00137 }
00138
00139 return ((k > 0) ? k : -1);
00140 }
00141
00142
/*
 * Print the command-line synopsis through the E_INFO / E_INFOCONT logging
 * macros and terminate the program.
 *
 * pgm  Program name shown in the synopsis (callers pass argv[0]).
 *
 * Does not return: it always ends with exit(0), including when it is called
 * after an argument error has been reported.
 */
static void
usagemsg(char *pgm)
{
    E_INFO("Usage: %s \\\n", pgm);
    E_INFOCONT("\t[-? | -h | -help] \\\n");
    E_INFOCONT("\t[-d | -debug] \\\n");
    E_INFOCONT("\t[-sps <sampling-rate> (16000)] \\\n");
    E_INFOCONT("\t[-b | -byteswap] \\\n");
    E_INFOCONT
        ("\t[{-s | -silsep} <length-silence-separator(sec)> (0.5)] \\\n");
    E_INFOCONT("\t[-w | -writeseg] \\\n");
    E_INFOCONT("\t[-min-noise <min-noise>] \\\n");
    E_INFOCONT("\t[-max-noise <max-noise>] \\\n");
    E_INFOCONT("\t[-delta-sil <delta-sil>] \\\n");
    E_INFOCONT("\t[-delta-speech <delta-speech>] \\\n");
    E_INFOCONT("\t[-sil-onset <sil-onset>] \\\n");
    E_INFOCONT("\t[-speech-onset <speech-onset>] \\\n");
    E_INFOCONT("\t[-adapt-rate <adapt-rate>] \\\n");
    E_INFOCONT("\t[-max-adreadsize <ad_read_blksize>] \\\n");
    E_INFOCONT("\t[-c <copy-input-file>] \\\n");
    E_INFOCONT("\t[-r | -rawmode] \\\n");
    E_INFOCONT("\t-i <input-file>\n");

    exit(0);
}
00168
00169
00170
00171
00172
00173
00174 int
00175 main(int32 argc, char **argv)
00176 {
00177 cont_ad_t *cont;
00178 int32 uttid, uttlen, starttime, siltime, sps, debug, writeseg, rawmode;
00179 int16 buf[4096];
00180 char *infile, *copyfile, segfile[1024];
00181 FILE *fp;
00182 float endsil;
00183 ad_rec_t ad;
00184 int32 i, k;
00185 int32 winsize, leader, trailer;
00186 int32 orig_min_noise, orig_max_noise;
00187 int32 orig_delta_sil, orig_delta_speech;
00188 int32 orig_speech_onset, orig_sil_onset;
00189 int32 min_noise, max_noise;
00190 int32 delta_sil, delta_speech;
00191 int32 sil_onset, speech_onset;
00192 float32 orig_adapt_rate;
00193 float32 adapt_rate;
00194 int32 total_speech_samples;
00195 float32 total_speech_sec;
00196 FILE *rawfp;
00197
00198
00199 cont = NULL;
00200 sps = 16000;
00201 swap = 0;
00202 endsil = 0.5;
00203 writeseg = 0;
00204 min_noise = max_noise = -1;
00205 delta_sil = delta_speech = -1;
00206 sil_onset = speech_onset = -1;
00207 adapt_rate = -1.0;
00208 max_ad_read_size = (int32) 0x7ffffff0;
00209 debug = 0;
00210 infile = NULL;
00211 copyfile = NULL;
00212 rawfp = NULL;
00213 rawmode = 0;
00214
00215
00216 for (i = 1; i < argc; i++) {
00217 if ((strcmp(argv[i], "-help") == 0)
00218 || (strcmp(argv[i], "-h") == 0)
00219 || (strcmp(argv[i], "-?") == 0)) {
00220 usagemsg(argv[0]);
00221 }
00222 else if ((strcmp(argv[i], "-debug") == 0)
00223 || (strcmp(argv[i], "-d") == 0)) {
00224 debug = 1;
00225 }
00226 else if (strcmp(argv[i], "-sps") == 0) {
00227 i++;
00228 if ((i == argc)
00229 || (sscanf(argv[i], "%d", &sps) != 1)
00230 || (sps <= 0)) {
00231 E_ERROR("Invalid -sps argument\n");
00232 usagemsg(argv[0]);
00233 }
00234 }
00235 else if ((strcmp(argv[i], "-byteswap") == 0)
00236 || (strcmp(argv[i], "-b") == 0)) {
00237 swap = 1;
00238 }
00239 else if ((strcmp(argv[i], "-silsep") == 0)
00240 || (strcmp(argv[i], "-s") == 0)) {
00241 i++;
00242 if ((i == argc)
00243 || (sscanf(argv[i], "%f", &endsil) != 1)
00244 || (endsil <= 0.0)) {
00245 E_ERROR("Invalid -silsep argument\n");
00246 usagemsg(argv[0]);
00247 }
00248 }
00249 else if ((strcmp(argv[i], "-writeseg") == 0)
00250 || (strcmp(argv[i], "-w") == 0)) {
00251 writeseg = 1;
00252 }
00253 else if (strcmp(argv[i], "-min-noise") == 0) {
00254 i++;
00255 if ((i == argc) ||
00256 (sscanf(argv[i], "%d", &min_noise) != 1) ||
00257 (min_noise < 0)) {
00258 E_ERROR("Invalid -min-noise argument\n");
00259 usagemsg(argv[0]);
00260 }
00261 }
00262 else if (strcmp(argv[i], "-max-noise") == 0) {
00263 i++;
00264 if ((i == argc) ||
00265 (sscanf(argv[i], "%d", &max_noise) != 1) ||
00266 (max_noise < 0)) {
00267 E_ERROR("Invalid -max-noise argument\n");
00268 usagemsg(argv[0]);
00269 }
00270 }
00271 else if (strcmp(argv[i], "-delta-sil") == 0) {
00272 i++;
00273 if ((i == argc) ||
00274 (sscanf(argv[i], "%d", &delta_sil) != 1) ||
00275 (delta_sil < 0)) {
00276 E_ERROR("Invalid -delta-sil argument\n");
00277 usagemsg(argv[0]);
00278 }
00279 }
00280 else if (strcmp(argv[i], "-delta-speech") == 0) {
00281 i++;
00282 if ((i == argc) ||
00283 (sscanf(argv[i], "%d", &delta_speech) != 1) ||
00284 (delta_speech < 0)) {
00285 E_ERROR("Invalid -delta-speech argument\n");
00286 usagemsg(argv[0]);
00287 }
00288 }
00289 else if (strcmp(argv[i], "-sil-onset") == 0) {
00290 i++;
00291 if ((i == argc) ||
00292 (sscanf(argv[i], "%d", &sil_onset) != 1) ||
00293 (sil_onset < 1)) {
00294 E_ERROR("Invalid -sil-onset argument\n");
00295 usagemsg(argv[0]);
00296 }
00297 }
00298 else if (strcmp(argv[i], "-speech-onset") == 0) {
00299 i++;
00300 if ((i == argc) ||
00301 (sscanf(argv[i], "%d", &speech_onset) != 1) ||
00302 (speech_onset < 1)) {
00303 E_ERROR("Invalid -speech-onset argument\n");
00304 usagemsg(argv[0]);
00305 }
00306 }
00307 else if (strcmp(argv[i], "-adapt-rate") == 0) {
00308 i++;
00309 if ((i == argc) ||
00310 (sscanf(argv[i], "%f", &adapt_rate) != 1) ||
00311 (adapt_rate < 0.0) || (adapt_rate > 1.0)) {
00312 E_ERROR("Invalid -adapt-rate argument\n");
00313 usagemsg(argv[0]);
00314 }
00315 }
00316 else if (strcmp(argv[i], "-max-adreadsize") == 0) {
00317 i++;
00318 if ((i == argc) ||
00319 (sscanf(argv[i], "%d", &max_ad_read_size) != 1) ||
00320 (max_ad_read_size < 1)) {
00321 E_ERROR("Invalid -max-adreadsize argument\n");
00322 usagemsg(argv[0]);
00323 }
00324 }
00325 else if (strcmp(argv[i], "-c") == 0) {
00326 i++;
00327 if (i == argc) {
00328 E_ERROR("Invalid -c argument\n");
00329 usagemsg(argv[0]);
00330 }
00331 copyfile = argv[i];
00332 }
00333 else if ((strcmp(argv[i], "-rawmode") == 0)
00334 || (strcmp(argv[i], "-r") == 0)) {
00335 rawmode = 1;
00336 }
00337 else if (strcmp(argv[i], "-i") == 0) {
00338 i++;
00339 if (i == argc) {
00340 E_ERROR("Invalid -i argument\n");
00341 usagemsg(argv[0]);
00342 }
00343 infile = argv[i];
00344 }
00345 else {
00346 usagemsg(argv[0]);
00347 }
00348 }
00349
00350 if (infile == NULL) {
00351 E_ERROR("No input file specified\n");
00352 usagemsg(argv[0]);
00353 }
00354
00355 if ((infp = fopen(infile, "rb")) == NULL)
00356 E_FATAL("fopen(%s,rb) failed\n", infile);
00357
00358
00359
00360
00361
00362
00363 ad.sps = sps;
00364 ad.bps = sizeof(int16);
00365 if (!rawmode)
00366 cont = cont_ad_init(&ad, file_ad_read);
00367 else
00368 cont = cont_ad_init_rawmode(&ad, file_ad_read);
00369
00370 printf("Calibrating ...");
00371 fflush(stdout);
00372 if (cont_ad_calib(cont) < 0)
00373 printf(" failed; file too short?\n");
00374 else
00375 printf(" done\n");
00376 rewind(infp);
00377
00378
00379 siltime = (int32) (endsil * sps);
00380
00381
00382 if (copyfile) {
00383 if ((rawfp = fopen(copyfile, "wb")) == NULL)
00384 E_ERROR("fopen(%s,wb) failed; not dumping raw file\n",
00385 copyfile);
00386 else
00387 cont_ad_set_rawfp(cont, rawfp);
00388 }
00389
00390 cont_ad_get_params(cont,
00391 &orig_delta_sil, &orig_delta_speech,
00392 &orig_min_noise, &orig_max_noise,
00393 &winsize,
00394 &orig_speech_onset, &orig_sil_onset,
00395 &leader, &trailer, &orig_adapt_rate);
00396
00397 E_INFO("Default parameters:\n");
00398 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n",
00399 orig_min_noise, orig_max_noise);
00400 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n",
00401 orig_delta_sil, orig_delta_speech);
00402 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n",
00403 orig_sil_onset, orig_speech_onset);
00404 E_INFOCONT("\tadapt_rate = %.3f\n", orig_adapt_rate);
00405
00406 if (min_noise < 0)
00407 min_noise = orig_min_noise;
00408 if (max_noise < 0)
00409 max_noise = orig_max_noise;
00410 if (delta_sil < 0)
00411 delta_sil = orig_delta_sil;
00412 if (delta_speech < 0)
00413 delta_speech = orig_delta_speech;
00414 if (sil_onset < 0)
00415 sil_onset = orig_sil_onset;
00416 if (speech_onset < 0)
00417 speech_onset = orig_speech_onset;
00418 if (adapt_rate < 0.0)
00419 adapt_rate = orig_adapt_rate;
00420
00421 cont_ad_set_params(cont,
00422 delta_sil, delta_speech,
00423 min_noise, max_noise,
00424 winsize,
00425 speech_onset, sil_onset,
00426 leader, trailer, adapt_rate);
00427
00428 E_INFO("Current parameters:\n");
00429 E_INFOCONT("\tmin-noise = %d, max-noise = %d\n", min_noise, max_noise);
00430 E_INFOCONT("\tdelta-sil = %d, delta-speech = %d\n", delta_sil,
00431 delta_speech);
00432 E_INFOCONT("\tsil-onset = %d, speech-onset = %d\n", sil_onset,
00433 speech_onset);
00434 E_INFOCONT("\tadapt_rate = %.3f\n", adapt_rate);
00435
00436 E_INFO("Sampling rate: %d", sps);
00437 E_INFOCONT("; Byteswap: %s", swap ? "Yes" : "No");
00438 E_INFOCONT("; Max ad-read size: %d\n", max_ad_read_size);
00439
00440 if (debug)
00441 cont_ad_set_logfp(cont, stdout);
00442
00443 total_speech_samples = 0;
00444 total_speech_sec = 0.0;
00445
00446 uttid = 0;
00447 uttlen = 0;
00448 starttime = 0;
00449 fp = NULL;
00450
00451
00452 for (;;) {
00453
00454 k = cont_ad_read(cont, buf, 4096);
00455
00456 if (k < 0) {
00457 if (fp != NULL) {
00458 fclose(fp);
00459 fp = NULL;
00460
00461 printf
00462 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
00463 uttid, (double) starttime / (double) sps,
00464 (double) (starttime + uttlen) / (double) sps,
00465 (double) uttlen / (double) sps, uttlen);
00466 fflush(stdout);
00467
00468 total_speech_samples += uttlen;
00469 total_speech_sec += (double) uttlen / (double) sps;
00470
00471 uttid++;
00472 }
00473
00474 break;
00475 }
00476
00477 if (cont->state == CONT_AD_STATE_SIL) {
00478 if (fp != NULL) {
00479 if (cont->seglen > siltime) {
00480 fclose(fp);
00481 fp = NULL;
00482
00483 printf
00484 ("Utt %08d, st= %8.2fs, et= %8.2fs, seg= %7.2fs (#samp= %10d)\n",
00485 uttid, (double) starttime / (double) sps,
00486 (double) (starttime + uttlen) / (double) sps,
00487 (double) uttlen / (double) sps, uttlen);
00488 fflush(stdout);
00489
00490 total_speech_samples += uttlen;
00491 total_speech_sec += (double) uttlen / (double) sps;
00492
00493 uttid++;
00494 }
00495 else {
00496
00497
00498
00499
00500 if (k > 0) {
00501 fwrite(buf, sizeof(int16), k, fp);
00502 uttlen += k;
00503 }
00504 }
00505 }
00506 }
00507 else {
00508 assert(cont->state == CONT_AD_STATE_SPEECH);
00509
00510 if (fp == NULL) {
00511 if (writeseg)
00512 sprintf(segfile, "%08d.raw", uttid);
00513 else
00514 strcpy(segfile, NULL_DEVICE);
00515 if ((fp = fopen(segfile, "wb")) == NULL)
00516 E_FATAL("fopen(%s,wb) failed\n", segfile);
00517
00518 starttime = cont->read_ts - k;
00519 uttlen = 0;
00520 }
00521
00522
00523 if (k > 0) {
00524 fwrite(buf, sizeof(int16), k, fp);
00525 uttlen += k;
00526 }
00527 }
00528 }
00529
00530 if (rawfp)
00531 fclose(rawfp);
00532
00533 E_INFO("Total raw input speech = %d frames, %d samples, %.2f sec\n",
00534 cont->tot_frm, cont->tot_frm * cont->spf,
00535 (cont->tot_frm * cont->spf) / (float32) cont->sps);
00536 E_INFO("Total speech detected = %d samples, %.2f sec\n",
00537 total_speech_samples, total_speech_sec);
00538
00539 cont_ad_close(cont);
00540
00541 return 0;
00542 }