SphinxBase
0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 #include <stdio.h> 00038 #include <stdlib.h> 00039 #include <string.h> 00040 #include <time.h> 00041 #include <assert.h> 00042 00043 #ifdef HAVE_CONFIG_H 00044 #include <config.h> 00045 #endif 00046 00047 #ifdef HAVE_SNDFILE_H 00048 #include <sndfile.h> 00049 #endif 00050 00051 #include <sphinxbase/fe.h> 00052 #include <sphinxbase/strfuncs.h> 00053 #include <sphinxbase/pio.h> 00054 #include <sphinxbase/filename.h> 00055 #include <sphinxbase/cmd_ln.h> 00056 #include <sphinxbase/err.h> 00057 #include <sphinxbase/ckd_alloc.h> 00058 #include <sphinxbase/byteorder.h> 00059 #include <sphinxbase/hash_table.h> 00060 00061 #include "sphinx_wave2feat.h" 00062 #include "cmd_ln_defn.h" 00063 00064 typedef struct audio_type_s { 00065 char const *name; 00066 int (*detect)(sphinx_wave2feat_t *wtf, char const *infile); 00067 int (*decode)(sphinx_wave2feat_t *wtf); 00068 } audio_type_t; 00069 00070 typedef struct output_type_s { 00071 char const *name; 00072 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat); 00073 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr); 00074 } output_type_t; 00075 00076 struct sphinx_wave2feat_s { 00077 int refcount; 00078 cmd_ln_t *config; 00079 fe_t *fe; 00080 char *infile; 00081 char *outfile; 00082 FILE *infh; 00083 FILE *outfh; 00084 short *audio; 00085 mfcc_t **feat; 00086 int blocksize; 00087 int featsize; 00088 int veclen; 00089 int in_veclen; 00090 int byteswap; 00091 #ifdef HAVE_SNDFILE_H 00092 SNDFILE *insfh; 00093 #endif 00094 output_type_t const *ot; 00095 }; 00096 00098 typedef struct RIFFHeader{ 00099 char rifftag[4]; /* "RIFF" string */ 00100 int32 TotalLength; /* Total length */ 00101 char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */ 00102 int32 RemainingLength; /* Remaining length */ 00103 int16 data_format; /* data format tag, 1 = PCM */ 00104 int16 numchannels; /* Number of channels in file */ 00105 int32 SamplingFreq; /* Sampling frequency */ 00106 int32 BytesPerSec; /* Average bytes/sec */ 00107 int16 BlockAlign; /* Block align */ 00108 int16 BitsPerSample; /* 8 or 16 bit */ 00109 char datatag[4]; /* "data" string */ 00110 int32 datalength; /* Raw data length */ 00111 } MSWAV_hdr; 00112 00118 static int 00119 detect_riff(sphinx_wave2feat_t *wtf, char const *infile) 00120 { 00121 FILE *fh; 00122 MSWAV_hdr hdr; 00123 00124 if ((fh = fopen(infile, "rb")) == NULL) { 00125 E_ERROR_SYSTEM("Failed to open %s", infile); 00126 return -1; 00127 } 00128 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) { 00129 E_ERROR_SYSTEM("Failed to read RIFF header"); 00130 fclose(fh); 00131 return -1; 00132 } 00133 /* Make sure it is actually a RIFF file. */ 00134 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) { 00135 fclose(fh); 00136 return FALSE; 00137 } 00138 00139 /* Get relevant information. */ 00140 cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels); 00141 cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq); 00142 if (wtf->infile) 00143 ckd_free(wtf->infile); 00144 wtf->infile = ckd_salloc(infile); 00145 wtf->infh = fh; 00146 00147 return TRUE; 00148 } 00149 00150 static int 00151 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh) 00152 { 00153 char nist[7]; 00154 lineiter_t *li; 00155 FILE *fh; 00156 00157 if ((fh = fopen(infile, "rb")) == NULL) { 00158 E_ERROR_SYSTEM("Failed to open %s", infile); 00159 return -1; 00160 } 00161 if (fread(&nist, 1, 7, fh) != 7) { 00162 E_ERROR_SYSTEM("Failed to read NIST header"); 00163 fclose(fh); 00164 return -1; 00165 } 00166 /* Is this actually a NIST file? */ 00167 if (0 != strncmp(nist, "NIST_1A", 7)) { 00168 fclose(fh); 00169 return FALSE; 00170 } 00171 /* Rewind, parse lines. */ 00172 fseek(fh, 0, SEEK_SET); 00173 for (li = lineiter_start(fh); li; li = lineiter_next(li)) { 00174 char **words; 00175 int nword; 00176 00177 string_trim(li->buf, STRING_BOTH); 00178 if (strlen(li->buf) == 0) { 00179 lineiter_free(li); 00180 break; 00181 } 00182 nword = str2words(li->buf, NULL, 0); 00183 if (nword != 3) 00184 continue; 00185 words = ckd_calloc(nword, sizeof(*words)); 00186 str2words(li->buf, words, nword); 00187 if (0 == strcmp(words[0], "sample_rate")) { 00188 cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2])); 00189 } 00190 if (0 == strcmp(words[0], "channel_count")) { 00191 cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2])); 00192 } 00193 if (0 == strcmp(words[0], "sample_byte_format")) { 00194 cmd_ln_set_str_r(wtf->config, "-input_endian", 00195 (0 == strcmp(words[2], "10")) ? "big" : "little"); 00196 } 00197 ckd_free(words); 00198 } 00199 00200 fseek(fh, 1024, SEEK_SET); 00201 if (out_fh) 00202 *out_fh = fh; 00203 else 00204 fclose(fh); 00205 return TRUE; 00206 } 00207 00208 #ifdef HAVE_POPEN 00209 static int 00210 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile) 00211 { 00212 FILE *fh; 00213 char *cmdline; 00214 int rv; 00215 00216 /* Determine if it's NIST file and get parameters. */ 00217 if ((rv = open_nist_file(wtf, infile, NULL)) != TRUE) 00218 return rv; 00219 00220 /* Now popen it with sph2pipe. */ 00221 cmdline = string_join("sph2pipe -f raw '", infile, "'", NULL); 00222 if ((fh = popen(cmdline, "r")) == NULL) { 00223 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", infile); 00224 ckd_free(cmdline); 00225 return -1; 00226 } 00227 00228 if (wtf->infile) 00229 ckd_free(wtf->infile); 00230 wtf->infile = ckd_salloc(infile); 00231 wtf->infh = fh; 00232 return TRUE; 00233 } 00234 #else /* !HAVE_POPEN */ 00235 static int 00236 detect_sph2pipe(sphinx_wave2feat_t *wtf, char const *infile) 00237 { 00238 E_ERROR("popen() not available, cannot run sph2pipe\n"); 00239 return -1; 00240 } 00241 #endif /* !HAVE_POPEN */ 00242 00248 static int 00249 detect_nist(sphinx_wave2feat_t *wtf, char const *infile) 00250 { 00251 FILE *fh; 00252 int rv; 00253 00254 if ((rv = open_nist_file(wtf, infile, &fh)) != TRUE) 00255 return rv; 00256 if (wtf->infile) 00257 ckd_free(wtf->infile); 00258 wtf->infile = ckd_salloc(infile); 00259 wtf->infh = fh; 00260 return TRUE; 00261 } 00262 00263 00270 static int 00271 detect_raw(sphinx_wave2feat_t *wtf, char const *infile) 00272 { 00273 FILE *fh; 00274 00275 if ((fh = fopen(infile, "rb")) == NULL) { 00276 E_ERROR_SYSTEM("Failed to open %s", infile); 00277 return -1; 00278 } 00279 if (wtf->infile) 00280 ckd_free(wtf->infile); 00281 wtf->infile = ckd_salloc(infile); 00282 wtf->infh = fh; 00283 return TRUE; 00284 } 00285 00292 static int 00293 detect_sphinx_mfc(sphinx_wave2feat_t *wtf, char const *infile) 00294 { 00295 FILE *fh; 00296 int32 len; 00297 long flen; 00298 00299 if ((fh = fopen(infile, "rb")) == NULL) { 00300 E_ERROR_SYSTEM("Failed to open %s", infile); 00301 return -1; 00302 } 00303 if (fread(&len, 4, 1, fh) != 1) { 00304 E_ERROR_SYSTEM("Failed to read header from %s\n", infile); 00305 return -1; 00306 } 00307 fseek(fh, 0, SEEK_END); 00308 flen = ftell(fh); 00309 00310 /* figure out whether to byteswap */ 00311 flen = (flen / 4) - 1; 00312 if (flen != len) { 00313 /* First make sure this is an endianness problem, otherwise fail. */ 00314 SWAP_INT32(&len); 00315 if (flen != len) { 00316 SWAP_INT32(&len); 00317 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n", 00318 len, flen); 00319 return -1; 00320 } 00321 /* Set the input endianness to the opposite of the machine endianness... */ 00322 cmd_ln_set_str_r(wtf->config, "-input_endian", 00323 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian")) 00324 ? "little" : "big")); 00325 } 00326 00327 fseek(fh, 4, SEEK_SET); 00328 if (wtf->infile) 00329 ckd_free(wtf->infile); 00330 wtf->infile = ckd_salloc(infile); 00331 wtf->infh = fh; 00332 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) { 00333 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt"); 00334 } 00335 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00336 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep"); 00337 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt"); 00338 } 00339 else { 00340 /* Should not happen. */ 00341 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n"); 00342 assert(FALSE); 00343 } 00344 00345 return TRUE; 00346 } 00347 00348 int 00349 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan) 00350 { 00351 int i, j; 00352 00353 if (whichchan > 0) { 00354 for (i = whichchan - 1; i < nsamp; i += nchans) 00355 buf[i/nchans] = buf[i]; 00356 } 00357 else { 00358 for (i = 0; i < nsamp; i += nchans) { 00359 float64 tmp = 0.0; 00360 for (j = 0; j < nchans && i + j < nsamp; ++j) { 00361 tmp += buf[i + j]; 00362 } 00363 buf[i/nchans] = (int16)(tmp / nchans); 00364 } 00365 } 00366 return i/nchans; 00367 } 00368 00369 #ifdef HAVE_SNDFILE_H 00370 00375 static int 00376 detect_sndfile(sphinx_wave2feat_t *wtf, char const *infile) 00377 { 00378 SNDFILE *sf; 00379 SF_INFO sfinfo; 00380 00381 memset(&sfinfo, 0, sizeof(sfinfo)); 00382 /* We let other detectors catch I/O errors, since there is 00383 no way to tell them from format errors when opening :( */ 00384 if ((sf = sf_open(infile, SFM_READ, &sfinfo)) == NULL) { 00385 return FALSE; 00386 } 00387 /* Get relevant information. */ 00388 cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels); 00389 cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate); 00390 if (wtf->infile) 00391 ckd_free(wtf->infile); 00392 wtf->infile = ckd_salloc(infile); 00393 wtf->insfh = sf; 00394 wtf->infh = NULL; 00395 00396 return TRUE; 00397 } 00398 00403 static int 00404 decode_sndfile(sphinx_wave2feat_t *wtf) 00405 { 00406 size_t nsamp; 00407 int32 nfr, nchans, whichchan; 00408 int nfloat, n; 00409 00410 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00411 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan"); 00412 fe_start_utt(wtf->fe); 00413 nfloat = 0; 00414 while ((nsamp = sf_read_short(wtf->insfh, 00415 wtf->audio, 00416 wtf->blocksize)) != 0) { 00417 int16 const *inspeech; 00418 size_t nvec; 00419 00420 /* Mix or pick channels. */ 00421 if (nchans > 1) 00422 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan); 00423 00424 inspeech = wtf->audio; 00425 nvec = wtf->featsize; 00426 /* Consume all samples. */ 00427 while (nsamp) { 00428 nfr = nvec; 00429 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr); 00430 if (nfr) { 00431 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00432 return -1; 00433 nfloat += n; 00434 } 00435 } 00436 inspeech = wtf->audio; 00437 } 00438 /* Now process any leftover audio frames. */ 00439 fe_end_utt(wtf->fe, wtf->feat[0], &nfr); 00440 if (nfr) { 00441 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00442 return -1; 00443 nfloat += n; 00444 } 00445 00446 sf_close(wtf->insfh); 00447 wtf->insfh = NULL; 00448 return nfloat; 00449 } 00450 #endif /* HAVE_SNDFILE_H */ 00451 00456 static int 00457 decode_pcm(sphinx_wave2feat_t *wtf) 00458 { 00459 size_t nsamp; 00460 int32 nfr, nchans, whichchan; 00461 int nfloat, n; 00462 00463 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00464 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan"); 00465 fe_start_utt(wtf->fe); 00466 nfloat = 0; 00467 while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) { 00468 size_t nvec; 00469 int16 const *inspeech; 00470 00471 /* Byteswap stuff here if necessary. */ 00472 if (wtf->byteswap) { 00473 for (n = 0; n < nsamp; ++n) 00474 SWAP_INT16(wtf->audio + n); 00475 } 00476 00477 /* Mix or pick channels. */ 00478 if (nchans > 1) 00479 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan); 00480 00481 inspeech = wtf->audio; 00482 nvec = wtf->featsize; 00483 /* Consume all samples. */ 00484 while (nsamp) { 00485 nfr = nvec; 00486 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr); 00487 if (nfr) { 00488 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00489 return -1; 00490 nfloat += n; 00491 } 00492 } 00493 inspeech = wtf->audio; 00494 } 00495 /* Now process any leftover audio frames. */ 00496 fe_end_utt(wtf->fe, wtf->feat[0], &nfr); 00497 if (nfr) { 00498 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00499 return -1; 00500 nfloat += n; 00501 } 00502 00503 if (fclose(wtf->infh) == EOF) 00504 E_ERROR_SYSTEM("Failed to close input file"); 00505 wtf->infh = NULL; 00506 return nfloat; 00507 } 00508 00513 static int 00514 decode_sphinx_mfc(sphinx_wave2feat_t *wtf) 00515 { 00516 int nfloat = 0, n; 00517 int featsize = wtf->featsize; 00518 00519 /* If the input vector length is less than the output length, we 00520 * need to do this one frame at a time, because there's empty 00521 * space at the end of each vector in wtf->feat. */ 00522 if (wtf->in_veclen < wtf->veclen) 00523 featsize = 1; 00524 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat), 00525 featsize * wtf->in_veclen, wtf->infh)) != 0) { 00526 int i, nfr = n / wtf->in_veclen; 00527 if (n % wtf->in_veclen) { 00528 E_ERROR("Size of file %d not a multiple of veclen %d\n", 00529 n, wtf->in_veclen); 00530 return -1; 00531 } 00532 /* Byteswap stuff here if necessary. */ 00533 if (wtf->byteswap) { 00534 for (i = 0; i < n; ++i) 00535 SWAP_FLOAT32(wtf->feat[0] + i); 00536 } 00537 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr); 00538 for (i = 0; i < nfr; ++i) { 00539 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) { 00540 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy")) 00541 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]); 00542 else 00543 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]); 00544 } 00545 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00546 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]); 00547 } 00548 } 00549 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00550 return -1; 00551 nfloat += n; 00552 } 00553 00554 if (fclose(wtf->infh) == EOF) 00555 E_ERROR_SYSTEM("Failed to close input file"); 00556 wtf->infh = NULL; 00557 return nfloat; 00558 } 00559 00560 static const audio_type_t types[] = { 00561 #ifdef HAVE_SNDFILE_H 00562 { "-sndfile", &detect_sndfile, &decode_sndfile }, 00563 #endif 00564 { "-mswav", &detect_riff, &decode_pcm }, 00565 { "-nist", &detect_nist, &decode_pcm }, 00566 { "-raw", &detect_raw, &decode_pcm }, 00567 { "-sph2pipe", &detect_sph2pipe, &decode_pcm } 00568 }; 00569 static const int ntypes = sizeof(types)/sizeof(types[0]); 00570 static const audio_type_t mfcc_type = { 00571 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc 00572 }; 00573 00579 static int 00580 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat) 00581 { 00582 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) { 00583 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile); 00584 return -1; 00585 } 00586 return 0; 00587 } 00588 00594 static int 00595 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00596 { 00597 int i, nfloat = 0; 00598 00599 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00600 for (i = 0; i < nfr; ++i) { 00601 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) { 00602 E_ERROR_SYSTEM("Writing %d values to %s failed", 00603 wtf->veclen, wtf->outfile); 00604 return -1; 00605 } 00606 nfloat += wtf->veclen; 00607 } 00608 return nfloat; 00609 } 00610 00611 typedef enum htk_feature_kind_e { 00612 WAVEFORM = 0, /* PCM audio (rarely used) */ 00613 LPC = 1, /* LPC filter coefficients */ 00614 LPCREFC = 2, /* LPC reflection coefficients */ 00615 LPCEPSTRA = 3, /* LPC-based cepstral coefficients */ 00616 LPCDELCEP = 4, /* LPCC plus deltas */ 00617 IREFC = 5, /* 16-bit integer LPC reflection coefficients */ 00618 MFCC = 6, /* MFCCs */ 00619 FBANK = 7, /* Log mel spectrum */ 00620 MELSPEC = 8, /* Linear mel spectrum */ 00621 USER = 9, /* User defined */ 00622 DISCRETE = 10, /* Vector quantized data */ 00623 PLP = 11 /* PLP coefficients */ 00624 } htk_feature_kind_t; 00625 00626 typedef enum htk_feature_flag_e { 00627 _E = 0000100, /* has energy */ 00628 _N = 0000200, /* absolute energy supressed */ 00629 _D = 0000400, /* has delta coefficients */ 00630 _A = 0001000, /* has acceleration (delta-delta) coefficients */ 00631 _C = 0002000, /* is compressed */ 00632 _Z = 0004000, /* has zero mean static coefficients (i.e. CMN) */ 00633 _K = 0010000, /* has CRC checksum */ 00634 _O = 0020000, /* has 0th cepstral coefficient */ 00635 _V = 0040000, /* has VQ data */ 00636 _T = 0100000 /* has third differential coefficients */ 00637 } htk_feature_flag_t; 00638 00642 static int 00643 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat) 00644 { 00645 int32 samp_period; 00646 int16 samp_size; 00647 int16 param_kind; 00648 int swap = FALSE; 00649 00650 /* HTK files are big-endian. */ 00651 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian"))) 00652 swap = TRUE; 00653 /* Same file size thing as in Sphinx files (I think) */ 00654 if (swap) SWAP_INT32(&nfloat); 00655 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) 00656 return -1; 00657 /* Sample period in 100ns units. */ 00658 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate")); 00659 if (swap) SWAP_INT32(&samp_period); 00660 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1) 00661 return -1; 00662 /* Sample size - veclen * sizeof each sample. */ 00663 samp_size = wtf->veclen * 4; 00664 if (swap) SWAP_INT16(&samp_size); 00665 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1) 00666 return -1; 00667 /* Format and flags. */ 00668 if (cmd_ln_boolean_r(wtf->config, "-logspec") 00669 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) 00670 param_kind = FBANK; /* log mel-filter bank outputs */ 00671 else 00672 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */ 00673 if (swap) SWAP_INT16(¶m_kind); 00674 if (fwrite(¶m_kind, 2, 1, wtf->outfh) != 1) 00675 return -1; 00676 00677 return 0; 00678 } 00679 00683 static int 00684 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00685 { 00686 int i, j, swap, htk_reorder, nfloat = 0; 00687 00688 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00689 /* This is possibly inefficient, but probably not a big deal. */ 00690 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian"))); 00691 htk_reorder = (0 == strcmp("htk", wtf->ot->name) 00692 && !(cmd_ln_boolean_r(wtf->config, "-logspec") 00693 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))); 00694 for (i = 0; i < nfr; ++i) { 00695 if (htk_reorder) { 00696 mfcc_t c0 = frames[i][0]; 00697 memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4); 00698 frames[i][wtf->veclen - 1] = c0; 00699 } 00700 if (swap) 00701 for (j = 0; j < wtf->veclen; ++j) 00702 SWAP_FLOAT32(frames[i] + j); 00703 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) { 00704 E_ERROR_SYSTEM("Writing %d values to %s failed", 00705 wtf->veclen, wtf->outfile); 00706 return -1; 00707 } 00708 nfloat += wtf->veclen; 00709 } 00710 return nfloat; 00711 } 00712 00716 static int 00717 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00718 { 00719 int i, j, nfloat = 0; 00720 00721 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00722 for (i = 0; i < nfr; ++i) { 00723 for (j = 0; j < wtf->veclen; ++j) { 00724 fprintf(wtf->outfh, "%.5g", frames[i][j]); 00725 if (j == wtf->veclen - 1) 00726 fprintf(wtf->outfh, "\n"); 00727 else 00728 fprintf(wtf->outfh, " "); 00729 } 00730 nfloat += wtf->veclen; 00731 } 00732 return nfloat; 00733 } 00734 00735 static const output_type_t outtypes[] = { 00736 { "sphinx", &output_header_sphinx, &output_frames_sphinx }, 00737 { "htk", &output_header_htk, &output_frames_htk }, 00738 { "text", NULL, &output_frames_text } 00739 }; 00740 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]); 00741 00742 sphinx_wave2feat_t * 00743 sphinx_wave2feat_init(cmd_ln_t *config) 00744 { 00745 sphinx_wave2feat_t *wtf; 00746 int i; 00747 00748 wtf = ckd_calloc(1, sizeof(*wtf)); 00749 wtf->refcount = 1; 00750 wtf->config = cmd_ln_retain(config); 00751 wtf->fe = fe_init_auto_r(wtf->config); 00752 wtf->ot = outtypes; /* Default (sphinx) type. */ 00753 for (i = 0; i < nouttypes; ++i) { 00754 output_type_t const *otype = &outtypes[i]; 00755 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) { 00756 wtf->ot = otype; 00757 break; 00758 } 00759 } 00760 if (i == nouttypes) { 00761 E_ERROR("Unknown output type: '%s'\n", 00762 cmd_ln_str_r(config, "-ofmt")); 00763 sphinx_wave2feat_free(wtf); 00764 return NULL; 00765 } 00766 00767 return wtf; 00768 } 00769 00770 int 00771 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf) 00772 { 00773 if (wtf == NULL) 00774 return 0; 00775 if (--wtf->refcount > 0) 00776 return wtf->refcount; 00777 00778 ckd_free(wtf->audio); 00779 ckd_free_2d(wtf->feat); 00780 ckd_free(wtf->infile); 00781 ckd_free(wtf->outfile); 00782 if (wtf->infh) { 00783 if (fclose(wtf->infh) == EOF) 00784 E_ERROR_SYSTEM("Failed to close input file"); 00785 } 00786 if (wtf->outfh) { 00787 if (fclose(wtf->outfh) == EOF) 00788 E_ERROR_SYSTEM("Failed to close output file"); 00789 } 00790 cmd_ln_free_r(wtf->config); 00791 fe_free(wtf->fe); 00792 ckd_free(wtf); 00793 00794 return 0; 00795 } 00796 00797 sphinx_wave2feat_t * 00798 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf) 00799 { 00800 ++wtf->refcount; 00801 return wtf; 00802 } 00803 00804 static audio_type_t const * 00805 detect_audio_type(sphinx_wave2feat_t *wtf, char const *infile) 00806 { 00807 audio_type_t const *atype; 00808 int i; 00809 00810 /* Special case audio type for Sphinx MFCC inputs. */ 00811 if (cmd_ln_boolean_r(wtf->config, "-spec2cep") 00812 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00813 int rv = mfcc_type.detect(wtf, infile); 00814 if (rv == -1) 00815 goto error_out; 00816 return &mfcc_type; 00817 } 00818 00819 /* Try to use the type of infile given on the command line. */ 00820 for (i = 0; i < ntypes; ++i) { 00821 int rv; 00822 atype = &types[i]; 00823 if (cmd_ln_boolean_r(wtf->config, atype->name)) { 00824 rv = (*atype->detect)(wtf, infile); 00825 if (rv == -1) 00826 goto error_out; 00827 else if (rv == TRUE) 00828 break; 00829 } 00830 } 00831 if (i == ntypes) { 00832 /* Detect file type of infile and get parameters. */ 00833 for (i = 0; i < ntypes; ++i) { 00834 int rv; 00835 atype = &types[i]; 00836 rv = (*atype->detect)(wtf, infile); 00837 if (rv == -1) 00838 goto error_out; 00839 else if (rv == TRUE) 00840 break; 00841 } 00842 if (i == ntypes) 00843 goto error_out; 00844 } 00845 return atype; 00846 error_out: 00847 if (wtf->infh) 00848 fclose(wtf->infh); 00849 wtf->infh = NULL; 00850 return NULL; 00851 } 00852 00853 int 00854 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf, 00855 char const *infile, char const *outfile) 00856 { 00857 int nchans, minfft, nfft, nfloat, veclen; 00858 audio_type_t const *atype; 00859 int fshift, fsize; 00860 00861 if (cmd_ln_boolean_r(wtf->config, "-verbose")) 00862 E_INFO("Converting %s to %s\n", infile, outfile); 00863 00864 /* Detect input file type. */ 00865 if ((atype = detect_audio_type(wtf, infile)) == NULL) 00866 return -1; 00867 00868 /* Determine whether to byteswap input. */ 00869 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"), 00870 cmd_ln_str_r(wtf->config, "-input_endian")); 00871 00872 /* Make sure the FFT size is sufficiently large. */ 00873 minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate") 00874 * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5); 00875 for (nfft = 1; nfft < minfft; nfft <<= 1) 00876 ; 00877 if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) { 00878 E_WARN("Value of -nfft = %d is too small, increasing to %d\n", 00879 cmd_ln_int32_r(wtf->config, "-nfft"), nfft); 00880 cmd_ln_set_int32_r(wtf->config, "-nfft", nfft); 00881 fe_free(wtf->fe); 00882 wtf->fe = fe_init_auto_r(wtf->config); 00883 } 00884 00885 /* Get the output frame size (if not already set). */ 00886 if (wtf->veclen == 0) 00887 wtf->veclen = fe_get_output_size(wtf->fe); 00888 00889 /* Set up the input and output buffers. */ 00890 fe_get_input_size(wtf->fe, &fshift, &fsize); 00891 /* Want to get at least a whole frame plus shift in here. Also we 00892 will either pick or mix multiple channels so we need to read 00893 them all at once. */ 00894 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00895 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans; 00896 if (wtf->blocksize < (fsize + fshift) * nchans) { 00897 E_INFO("Block size of %d too small, increasing to %d\n", 00898 wtf->blocksize, 00899 (fsize + fshift) * nchans); 00900 wtf->blocksize = (fsize + fshift) * nchans; 00901 } 00902 wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio)); 00903 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift; 00904 00905 /* Use the maximum of the input and output frame sizes to allocate this. */ 00906 veclen = wtf->veclen; 00907 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen; 00908 wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat)); 00909 00910 /* Let's go! */ 00911 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) { 00912 E_ERROR_SYSTEM("Failed to open %s for writing", outfile); 00913 return -1; 00914 } 00915 /* Write an empty header, which we'll fill in later. */ 00916 if (wtf->ot->output_header && 00917 (*wtf->ot->output_header)(wtf, 0) < 0) { 00918 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile); 00919 goto error_out; 00920 } 00921 wtf->outfile = ckd_salloc(outfile); 00922 00923 if ((nfloat = (*atype->decode)(wtf)) < 0) 00924 return -1; 00925 00926 if (wtf->ot->output_header) { 00927 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) { 00928 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile); 00929 goto error_out; 00930 } 00931 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) { 00932 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile); 00933 goto error_out; 00934 } 00935 } 00936 if (fclose(wtf->outfh) == EOF) 00937 E_ERROR_SYSTEM("Failed to close output file"); 00938 wtf->outfh = NULL; 00939 00940 return 0; 00941 error_out: 00942 if (wtf->outfh) { 00943 fclose(wtf->outfh); 00944 wtf->outfh = NULL; 00945 } 00946 return -1; 00947 } 00948 00949 void 00950 build_filenames(cmd_ln_t *config, char const *basename, 00951 char **out_infile, char **out_outfile) 00952 { 00953 char const *di, *do_, *ei, *eo; 00954 00955 di = cmd_ln_str_r(config, "-di"); 00956 do_ = cmd_ln_str_r(config, "-do"); 00957 ei = cmd_ln_str_r(config, "-ei"); 00958 eo = cmd_ln_str_r(config, "-eo"); 00959 00960 *out_infile = string_join(di ? di : "", 00961 di ? "/" : "", 00962 basename, 00963 ei ? "." : "", 00964 ei ? ei : "", 00965 NULL); 00966 *out_outfile = string_join(do_ ? do_ : "", 00967 do_ ? "/" : "", 00968 basename, 00969 eo ? "." : "", 00970 eo ? eo : "", 00971 NULL); 00972 /* Build output directory structure if possible/requested (it is 00973 * by default). */ 00974 if (cmd_ln_boolean_r(config, "-build_outdirs")) { 00975 char *dirname = ckd_salloc(*out_outfile); 00976 path2dirname(*out_outfile, dirname); 00977 build_directory(dirname); 00978 ckd_free(dirname); 00979 } 00980 } 00981 00982 static int 00983 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile) 00984 { 00985 hash_table_t *files; 00986 hash_iter_t *itor; 00987 lineiter_t *li; 00988 FILE *ctlfh; 00989 int nskip, runlen, npart, rv = 0; 00990 00991 if ((ctlfh = fopen(ctlfile, "r")) == NULL) { 00992 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile); 00993 return -1; 00994 } 00995 nskip = cmd_ln_int32_r(wtf->config, "-nskip"); 00996 runlen = cmd_ln_int32_r(wtf->config, "-runlen"); 00997 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) { 00998 /* Count lines in the file. */ 00999 int partlen, part, nlines = 0; 01000 part = cmd_ln_int32_r(wtf->config, "-part"); 01001 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) 01002 ++nlines; 01003 fseek(ctlfh, 0, SEEK_SET); 01004 partlen = nlines / npart; 01005 nskip = partlen * (part - 1); 01006 if (part == npart) 01007 runlen = -1; 01008 else 01009 runlen = partlen; 01010 } 01011 if (runlen != -1){ 01012 E_INFO("Processing %d utterances at position %d\n", runlen, nskip); 01013 files = hash_table_new(runlen, HASH_CASE_YES); 01014 } 01015 else { 01016 E_INFO("Processing all remaining utterances at position %d\n", nskip); 01017 files = hash_table_new(1000, HASH_CASE_YES); 01018 } 01019 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) { 01020 char *c, *infile, *outfile; 01021 01022 if (nskip-- > 0) 01023 continue; 01024 if (runlen == 0) { 01025 lineiter_free(li); 01026 break; 01027 } 01028 --runlen; 01029 01030 string_trim(li->buf, STRING_BOTH); 01031 /* Extract the file ID from the control line. */ 01032 if ((c = strchr(li->buf, ' ')) != NULL) 01033 *c = '\0'; 01034 build_filenames(wtf->config, li->buf, &infile, &outfile); 01035 if (hash_table_lookup(files, infile, NULL) == 0) 01036 continue; 01037 rv = sphinx_wave2feat_convert_file(wtf, infile, outfile); 01038 hash_table_enter(files, infile, outfile); 01039 if (rv != 0) { 01040 lineiter_free(li); 01041 if (fclose(ctlfh) == EOF) 01042 E_ERROR_SYSTEM("Failed to close control file"); 01043 break; 01044 } 01045 } 01046 for (itor = hash_table_iter(files); itor; 01047 itor = hash_table_iter_next(itor)) { 01048 ckd_free((void *)hash_entry_key(itor->ent)); 01049 ckd_free(hash_entry_val(itor->ent)); 01050 } 01051 hash_table_free(files); 01052 return rv; 01053 } 01054 01055 int 01056 main(int argc, char *argv[]) 01057 { 01058 sphinx_wave2feat_t *wtf; 01059 cmd_ln_t *config; 01060 int rv; 01061 01062 /* Initialize config. */ 01063 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) 01064 return 2; 01065 01066 /* Parse an argument file if there's one in there. */ 01067 if (cmd_ln_str_r(config, "-argfile")) 01068 config = cmd_ln_parse_file_r(config, defn, 01069 cmd_ln_str_r(config, "-argfile"), FALSE); 01070 if (config == NULL) { 01071 E_ERROR("Command line parsing failed\n"); 01072 return 1; 01073 } 01074 if ((wtf = sphinx_wave2feat_init(config)) == NULL) { 01075 E_ERROR("Failed to initialize wave2feat object\n"); 01076 return 1; 01077 } 01078 01079 /* If there's a control file run through it, otherwise we will do 01080 * a single file (which is what run_control_file will do 01081 * internally too) */ 01082 if (cmd_ln_str_r(config, "-c")) 01083 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c")); 01084 else 01085 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"), 01086 cmd_ln_str_r(config, "-o")); 01087 01088 sphinx_wave2feat_free(wtf); 01089 return rv; 01090 }