00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
/*
 * Compatibility shim: when WIN32 is defined and GNUWINCE is not, the names
 * srand48() and lrand48() are mapped onto srand() and rand().
 * These definitions sit before the _NEW_FE_H_ include guard, so they are
 * processed on every inclusion of this header.
 * NOTE(review): nothing in this header includes a declaration of
 * srand()/rand() directly — confirm users get <stdlib.h> from elsewhere.
 */
00051 #if defined(WIN32) && !defined(GNUWINCE)
00052 #define srand48(x) srand(x)
00053 #define lrand48() rand()
00054 #endif
00055
00056 #ifndef _NEW_FE_H_
00057 #define _NEW_FE_H_
00058
00059
00060 #include <sphinxbase_export.h>
00061
00062 #include <sphinx_config.h>
00063 #include <cmd_ln.h>
00064 #include <fixpoint.h>
00065
00066 #ifdef __cplusplus
00067 extern "C" {
00068 #endif
00069 #if 0
00070
00071 }
00072 #endif
00073
/*
 * NATIVE_ENDIAN is the string "big" when WORDS_BIGENDIAN is defined and
 * "little" otherwise. It is used below as the default value of the
 * "-input_endian" option.
 * NOTE(review): WORDS_BIGENDIAN is presumably supplied by the build
 * configuration (sphinx_config.h) — confirm.
 */
00074 #ifdef WORDS_BIGENDIAN
00075 #define NATIVE_ENDIAN "big"
00076 #else
00077 #define NATIVE_ENDIAN "little"
00078 #endif
00079
/*
 * Default parameter values. Most of them are turned into default-value
 * strings with ARG_STRINGIFY() inside
 * waveform_to_cepstral_command_line_macro() below; the option each one feeds
 * is named next to it. Units are not stated anywhere in this header.
 */

/* Default for "-samprate" ("Sampling rate"). Presumably Hz — TODO confirm. */
00081 #define DEFAULT_SAMPLING_RATE 16000

/* Default for "-frate" ("Frame rate"). Presumably frames per second — TODO confirm. */
00083 #define DEFAULT_FRAME_RATE 100

/*
 * Not referenced elsewhere in this header. Numerically equal to
 * DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE; presumably samples per frame
 * shift — TODO confirm.
 */
00086 #define DEFAULT_FRAME_SHIFT 160

/* Default for "-wlen" ("Hamming window length"). Presumably seconds — TODO confirm. */
00088 #define DEFAULT_WINDOW_LENGTH 0.025625

/* Default for "-nfft" ("Size of FFT"). */
00090 #define DEFAULT_FFT_SIZE 512

/* Default for "-ncep" ("Number of cep coefficients"). */
00092 #define DEFAULT_NUM_CEPSTRA 13

/* Default for "-nfilt" ("Number of filter banks"). */
00094 #define DEFAULT_NUM_FILTERS 40

/* Default for "-lowerf" ("Lower edge of filters"). */
00096 #define DEFAULT_LOWER_FILT_FREQ 133.33334

/* Default for "-upperf" ("Upper edge of filters"). */
00098 #define DEFAULT_UPPER_FILT_FREQ 6855.4976

/* Default for "-alpha" ("Preemphasis parameter"). */
00100 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97

/* Default for "-warp_type"; already a string literal, so it is used without ARG_STRINGIFY(). */
00102 #define DEFAULT_WARP_TYPE "inverse_linear"

/*
 * Default for "-seed", whose help text says a value less than zero means
 * "pick our own".
 * NOTE(review): the value is deliberately left unparenthesized because it is
 * passed to ARG_STRINGIFY(); presumably wrapping it in parentheses would put
 * those parentheses into the default string — confirm against the
 * ARG_STRINGIFY definition before changing. As written, avoid using SEED in
 * arithmetic expressions.
 */
00104 #define SEED -1
00105
/*
 * Expands to a comma-separated sequence of brace-enclosed initializers, one
 * per front-end command-line option. Each initializer lists, in order: the
 * option name, an ARG_* type constant, the default value as a string (or
 * NULL for "-warp_params"), and a help string.
 *
 * The expansion ends without a trailing comma and without a terminating
 * entry, so the place that uses the macro has to supply whatever follows.
 *
 * NOTE(review): presumably these are initializers for arg_t elements (the
 * type returned by fe_get_args()) — confirm against cmd_ln.h.
 */
00106 #define waveform_to_cepstral_command_line_macro() \
00107 { "-logspec", \
00108 ARG_BOOLEAN, \
00109 "no", \
00110 "Write out logspectral files instead of cepstra" }, \
00111 \
00112 { "-smoothspec", \
00113 ARG_BOOLEAN, \
00114 "no", \
00115 "Write out cepstral-smoothed logspectral files" }, \
00116 \
00117 { "-transform", \
00118 ARG_STRING, \
00119 "legacy", \
00120 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
00121 \
00122 { "-spec2cep", \
00123 ARG_BOOLEAN, \
00124 "no", \
00125 "Input is log spectral files, output is cepstral files" }, \
00126 \
00127 { "-cep2spec", \
00128 ARG_BOOLEAN, \
00129 "no", \
00130 "Input is cepstral files, output is log spectral files" }, \
00131 \
00132 { "-alpha", \
00133 ARG_FLOAT32, \
00134 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
00135 "Preemphasis parameter" }, \
00136 \
00137 { "-samprate", \
00138 ARG_FLOAT32, \
00139 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
00140 "Sampling rate" }, \
00141 \
00142 { "-frate", \
00143 ARG_INT32, \
00144 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
00145 "Frame rate" }, \
00146 \
00147 { "-wlen", \
00148 ARG_FLOAT32, \
00149 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
00150 "Hamming window length" }, \
00151 \
00152 { "-nfft", \
00153 ARG_INT32, \
00154 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
00155 "Size of FFT" }, \
00156 \
00157 { "-nfilt", \
00158 ARG_INT32, \
00159 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
00160 "Number of filter banks" }, \
00161 \
00162 { "-lowerf", \
00163 ARG_FLOAT32, \
00164 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
00165 "Lower edge of filters" }, \
00166 \
00167 { "-upperf", \
00168 ARG_FLOAT32, \
00169 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
00170 "Upper edge of filters" }, \
00171 \
00172 { "-unit_area", \
00173 ARG_BOOLEAN, \
00174 "yes", \
00175 "Normalize mel filters to unit area" }, \
00176 \
00177 { "-round_filters", \
00178 ARG_BOOLEAN, \
00179 "yes", \
00180 "Round mel filter frequencies to DFT points" }, \
00181 \
00182 { "-ncep", \
00183 ARG_INT32, \
00184 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
00185 "Number of cep coefficients" }, \
00186 \
00187 { "-doublebw", \
00188 ARG_BOOLEAN, \
00189 "no", \
00190 "Use double bandwidth filters (same center freq)" }, \
00191 \
00192 { "-lifter", \
00193 ARG_INT32, \
00194 "0", \
00195 "Length of sin-curve for liftering, or 0 for no liftering." }, \
00196 \
00197 { "-input_endian", \
00198 ARG_STRING, \
00199 NATIVE_ENDIAN, \
00200 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
00201 \
00202 { "-warp_type", \
00203 ARG_STRING, \
00204 DEFAULT_WARP_TYPE, \
00205 "Warping function type (or shape)" }, \
00206 \
00207 { "-warp_params", \
00208 ARG_STRING, \
00209 NULL, \
00210 "Parameters defining the warping function" }, \
00211 \
00212 { "-dither", \
00213 ARG_BOOLEAN, \
00214 "no", \
00215 "Add 1/2-bit noise" }, \
00216 \
00217 { "-seed", \
00218 ARG_INT32, \
00219 ARG_STRINGIFY(SEED), \
00220 "Seed for random number generator; if less than zero, pick our own" }, \
00221 \
00222 { "-remove_dc", \
00223 ARG_BOOLEAN, \
00224 "no", \
00225 "Remove DC offset from each frame" }, \
00226 \
00227 { "-verbose", \
00228 ARG_BOOLEAN, \
00229 "no", \
00230 "Show input filenames" }
00231
/*
 * mfcc_t is the element type used for feature values throughout this API,
 * selected at compile time by FIXED_POINT. The MFCC* macros give one
 * spelling for conversion, multiplication and logarithm in both builds.
 */
00232 #ifdef FIXED_POINT

/* Fixed-point build: mfcc_t is fixed32 and every macro forwards to a FIX* macro. */
00234 typedef fixed32 mfcc_t;

/* Forwards to FLOAT2FIX(). */
00237 #define FLOAT2MFCC(x) FLOAT2FIX(x)

/* Forwards to FIX2FLOAT(). */
00239 #define MFCC2FLOAT(x) FIX2FLOAT(x)

/* Forwards to FIXMUL() and FIXLN_ANY() respectively, passing all arguments through. */
00241 #define MFCCMUL(a,b) FIXMUL(a,b)
00242 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
00243 #else

/* Floating-point build: mfcc_t is float32, so the conversions are identity. */
00246 typedef float32 mfcc_t;
00248 #define FLOAT2MFCC(x) (x)

00250 #define MFCC2FLOAT(x) (x)

/*
 * Plain multiplication, and log(x) with the `in` and `out` arguments
 * discarded.
 * NOTE(review): this header does not directly include a declaration of
 * log() — confirm users of MFCCLN get <math.h> from elsewhere.
 */
00252 #define MFCCMUL(a,b) ((a)*(b))
00253 #define MFCCLN(x,in,out) log(x)
00254 #endif
00255
/*
 * Front-end object handle. struct fe_s is only forward-declared in the
 * visible part of this header, so the functions below take and return it by
 * pointer.
 */
00259 typedef struct fe_s fe_t;
00260
/*
 * Status codes for the front end. FE_SUCCESS and FE_OUTPUT_FILE_SUCCESS
 * share the value 0; every error code is a distinct negative value from
 * -1 down to -10.
 * NOTE(review): presumably the int-returning fe_* functions below report
 * these codes — confirm per function in the implementation.
 */
00264 enum fe_error_e {
00265 FE_SUCCESS = 0,
00266 FE_OUTPUT_FILE_SUCCESS = 0,
00267 FE_CONTROL_FILE_ERROR = -1,
00268 FE_START_ERROR = -2,
00269 FE_UNKNOWN_SINGLE_OR_BATCH = -3,
00270 FE_INPUT_FILE_OPEN_ERROR = -4,
00271 FE_INPUT_FILE_READ_ERROR = -5,
00272 FE_MEM_ALLOC_ERROR = -6,
00273 FE_OUTPUT_FILE_WRITE_ERROR = -7,
00274 FE_OUTPUT_FILE_OPEN_ERROR = -8,
00275 FE_ZERO_ENERGY_ERROR = -9,
00276 FE_INVALID_PARAM_ERROR = -10
00277 };
00278
/**
 * Takes no arguments and returns a pointer to an fe_t.
 * NOTE(review): presumably creates a front end from globally parsed
 * command-line settings and returns NULL on failure — confirm in the
 * implementation.
 */
00286 SPHINXBASE_EXPORT
00287 fe_t* fe_init_auto(void);
00288
/**
 * Takes no arguments and returns a pointer to const arg_t.
 * NOTE(review): presumably the option table built from
 * waveform_to_cepstral_command_line_macro() — confirm in the implementation.
 */
00296 SPHINXBASE_EXPORT
00297 arg_t const *fe_get_args(void);
00298
/**
 * Takes a cmd_ln_t configuration and returns a pointer to an fe_t.
 *
 * @param config Configuration object.
 * NOTE(review): presumably the explicit-configuration counterpart of
 * fe_init_auto(); whether `config` is retained by the returned object and
 * what is returned on failure are not visible here — confirm.
 */
00309 SPHINXBASE_EXPORT
00310 fe_t *fe_init_auto_r(cmd_ln_t *config);
00311
/**
 * Returns a cmd_ln_t pointer for the given front end.
 *
 * @param fe Front-end object.
 * NOTE(review): presumably the configuration the object was created with;
 * ownership of the returned pointer is not visible here — confirm.
 */
00319 SPHINXBASE_EXPORT
00320 cmd_ln_t *fe_get_config(fe_t *fe);
00321
/**
 * Takes a front-end object and returns an int.
 *
 * @param fe Front-end object.
 * NOTE(review): presumably begins processing of a new utterance and returns
 * an fe_error_e code — confirm.
 */
00326 SPHINXBASE_EXPORT
00327 int fe_start_utt(fe_t *fe);
00328
/**
 * Takes a front-end object and returns an int.
 *
 * @param fe Front-end object.
 * NOTE(review): presumably the number of mfcc_t values produced per output
 * frame — confirm.
 */
00341 SPHINXBASE_EXPORT
00342 int fe_get_output_size(fe_t *fe);
00343
/**
 * Returns nothing directly; results are delivered through two int pointers.
 *
 * @param fe              Front-end object.
 * @param out_frame_shift Output location (by name, the frame shift).
 * @param out_frame_size  Output location (by name, the frame size).
 * NOTE(review): units (presumably samples) and whether either pointer may be
 * NULL are not visible here — confirm.
 */
00356 SPHINXBASE_EXPORT
00357 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
00358 int *out_frame_size);
00359
/**
 * Takes a front-end object plus two output pointers and returns an int.
 *
 * @param fe            Front-end object.
 * @param out_cepvector Buffer of mfcc_t values to write into.
 * @param out_nframes   Output location for a frame count.
 * NOTE(review): presumably finishes the current utterance, emitting any
 * final frame into `out_cepvector`; required buffer size and return-code
 * meaning are not visible here — confirm.
 */
00374 SPHINXBASE_EXPORT
00375 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
00376
/**
 * Takes a front-end object and returns a pointer to an fe_t.
 *
 * @param fe Front-end object.
 * NOTE(review): presumably takes an additional reference and returns the
 * same pointer — confirm.
 */
00382 SPHINXBASE_EXPORT
00383 fe_t *fe_retain(fe_t *fe);
00384
/**
 * Takes a front-end object and returns an int.
 *
 * @param fe Front-end object.
 * NOTE(review): presumably drops one reference (see fe_retain()) and returns
 * the remaining count, destroying the object at zero — confirm.
 */
00392 SPHINXBASE_EXPORT
00393 int fe_free(fe_t *fe);
00394
/**
 * Takes read-only 16-bit samples and an output buffer; returns an int.
 *
 * @param fe      Front-end object.
 * @param spch    Input samples (not modified through this pointer).
 * @param nsamps  Number of samples in `spch`.
 * @param out_cep Buffer of mfcc_t values to write into.
 * NOTE(review): presumably processes exactly one frame; the required sizes
 * of `spch` and `out_cep` are not visible here (see fe_get_input_size() and
 * fe_get_output_size()) — confirm.
 */
00403 SPHINXBASE_EXPORT
00404 int fe_process_frame(fe_t *fe, int16 const *spch,
00405 int32 nsamps, mfcc_t *out_cep);
00406
/**
 * Takes pointers to the caller's sample cursor and counters; returns an int.
 *
 * @param fe            Front-end object.
 * @param inout_spch    Pointer to the caller's read-only sample pointer.
 * @param inout_nsamps  Pointer to the caller's sample count.
 * @param buf_cep       Array of pointers to mfcc_t frame buffers.
 * @param inout_nframes Pointer to the caller's frame count.
 * NOTE(review): the `inout_` names suggest these are read on entry and
 * updated on return (samples consumed/remaining, frames available/written);
 * the exact protocol and whether `buf_cep` may be NULL are not visible
 * here — confirm.
 */
00453 SPHINXBASE_EXPORT
00454 int fe_process_frames(fe_t *fe,
00455 int16 const **inout_spch,
00456 size_t *inout_nsamps,
00457 mfcc_t **buf_cep,
00458 int32 *inout_nframes);
00459
/**
 * Takes read-only samples and two output locations; returns an int.
 *
 * @param fe        Front-end object.
 * @param spch      Input samples (not modified through this pointer).
 * @param nsamps    Number of samples in `spch`.
 * @param cep_block Output location for a pointer to a 2-D mfcc_t array.
 * @param nframes   Output location for a frame count.
 * NOTE(review): presumably the array stored in `*cep_block` is allocated by
 * this call and must be released by the caller with fe_free_2d() — confirm.
 */
00475 SPHINXBASE_EXPORT
00476 int fe_process_utt(fe_t *fe,
00477 int16 const *spch,
00478 size_t nsamps,
00479 mfcc_t ***cep_block,
00480 int32 *nframes
00481 );
00482
/**
 * Takes an untyped pointer and returns nothing.
 *
 * @param arr Array to release.
 * NOTE(review): presumably frees a 2-D array such as the one produced by
 * fe_process_utt() — confirm.
 */
00486 SPHINXBASE_EXPORT
00487 void fe_free_2d(void *arr);
00488
/**
 * Takes mfcc_t frames and float32 frames, both as arrays of row pointers;
 * returns an int.
 *
 * @param fe      Front-end object.
 * @param input   Source frames (mfcc_t).
 * @param output  Destination frames (float32).
 * @param nframes Number of frames.
 * In builds without FIXED_POINT, mfcc_t is float32, so both arrays have the
 * same element type.
 * NOTE(review): whether `input` and `output` may alias and what the return
 * value means are not visible here — confirm.
 */
00492 SPHINXBASE_EXPORT
00493 int fe_mfcc_to_float(fe_t *fe,
00494 mfcc_t **input,
00495 float32 **output,
00496 int32 nframes);
00497
/**
 * Takes float32 frames and mfcc_t frames, both as arrays of row pointers;
 * returns an int. The parameter types mirror fe_mfcc_to_float() with source
 * and destination swapped.
 *
 * @param fe      Front-end object.
 * @param input   Source frames (float32).
 * @param output  Destination frames (mfcc_t).
 * @param nframes Number of frames.
 * In builds without FIXED_POINT, mfcc_t is float32, so both arrays have the
 * same element type.
 * NOTE(review): whether `input` and `output` may alias and what the return
 * value means are not visible here — confirm.
 */
00501 SPHINXBASE_EXPORT
00502 int fe_float_to_mfcc(fe_t *fe,
00503 float32 **input,
00504 mfcc_t **output,
00505 int32 nframes);
00506
/**
 * Takes one read-only frame and one writable frame; returns an int.
 *
 * @param fe      Front-end object.
 * @param fr_spec Input frame (by name, log spectrum; not modified).
 * @param fr_cep  Output frame (by name, cepstrum).
 * NOTE(review): presumably applies the transform selected by the
 * "-transform" option; frame lengths are not visible here — confirm.
 */
00530 SPHINXBASE_EXPORT
00531 int fe_logspec_to_mfcc(fe_t *fe,
00532 const mfcc_t *fr_spec,
00533 mfcc_t *fr_cep
00534 );
00535
/**
 * Takes one read-only frame and one writable frame; returns an int.
 *
 * @param fe      Front-end object.
 * @param fr_spec Input frame (by name, log spectrum; not modified).
 * @param fr_cep  Output frame (by name, cepstrum).
 * NOTE(review): by name, a type-2 DCT of the log spectrum; scaling and frame
 * lengths are not visible here — confirm.
 */
00544 SPHINXBASE_EXPORT
00545 int fe_logspec_dct2(fe_t *fe,
00546 const mfcc_t *fr_spec,
00547 mfcc_t *fr_cep
00548 );
00549
/**
 * Takes one read-only frame and one writable frame; returns an int. The
 * direction is the reverse of fe_logspec_dct2(): here the cepstral frame is
 * the const input and the spectral frame is written.
 *
 * @param fe      Front-end object.
 * @param fr_cep  Input frame (by name, cepstrum; not modified).
 * @param fr_spec Output frame (by name, log spectrum).
 * NOTE(review): by name, a type-3 DCT of the cepstrum; scaling and frame
 * lengths are not visible here — confirm.
 */
00558 SPHINXBASE_EXPORT
00559 int fe_mfcc_dct3(fe_t *fe,
00560 const mfcc_t *fr_cep,
00561 mfcc_t *fr_spec
00562 );
00563
00564 #ifdef __cplusplus
00565 }
00566 #endif
00567
00568
00569 #endif