Introduction to the WAVE File Format
08/14/2019 Tags: Signal_Process
Purpose
While working on a speech recognition project, the test audio data used for voice processing was stored as WAVE PCM files.
To quickly recap the WAVE PCM format and to avoid forgetting bits and pieces of this knowledge, I have recorded the relevant information in this post.
Abstract
The WAVE file format is a subset of Microsoft’s RIFF specification for the storage of multimedia files. A RIFF file starts out with a file header followed by a sequence of data chunks. A WAVE file is often just a RIFF file with a single “WAVE” chunk which consists of two sub-chunks: a “fmt” chunk specifying the data format and a “data” chunk containing the actual sample data.
The WAVE File Structure
The canonical WAVE format typically starts with the RIFF header:
Field | Bytes | Bytes Offset | Endian | Description |
---|---|---|---|---|
Chunk ID | 4 | 0 | Big endian | This chunk contains the letters "RIFF" in ASCII form. |
Chunk Size | 4 | 4 | Little endian | This is the size of the rest of the chunk following this number, i.e. the size of the entire file in bytes minus the 8 bytes taken by Chunk ID and Chunk Size. |
Format | 4 | 8 | Big endian | This chunk contains the letters "WAVE". |
Sub-Chunk1 ID | 4 | 12 | Big endian | This chunk contains the letters "fmt " (note the trailing space). |
Sub-Chunk1 Size | 4 | 16 | Little endian | 16 for PCM. |
Audio Format | 2 | 20 | Little endian | 1 for PCM, 2 for ADPCM, 3 for IEEE floating point, 7 for u-law, and 65534 (0xFFFE) for WaveFormatExtensible. |
Num Channels | 2 | 22 | Little endian | 1 for Mono, 2 for stereo, etc. |
Sample Rate | 4 | 24 | Little endian | 8000, 16000, 44100, etc. Hz. A typical value is 44100, which is the same as an audio CD. 8000 is the rate used for telephone and wireless microphone transmission and is adequate for human speech. 16000 is used in most modern VoIP and VVoIP communication products. |
Byte Rate | 4 | 28 | Little endian | This value is equal to Sample Rate * Num of Channels * Bytes per Sample. |
Block Align | 2 | 32 | Little endian | This value is equal to Num of Channels * Bytes per Sample. |
Bits Per Sample | 2 | 34 | Little endian | 8 for 8 bits (1 byte), 16 for 16 bits (2 bytes), etc. |
Sub-Chunk2 ID | 4 | 36 | Big endian | This chunk contains the letters "data". |
Sub-Chunk2 Size | 4 | 40 | Little endian | This value is equal to Num of Samples * Num of Channels * Bytes per Sample. |
Data | * | 44 | Little endian | The actual sound data. |
Example: the structure of WAVE file format
/*
 * Canonical 44-byte header of a PCM WAVE file, followed by the sample data.
 *
 * The members are declared in on-disk order so that a header can be read
 * directly into this struct. That relies on `int` being 4 bytes and `short`
 * being 2 bytes (every member is then naturally aligned, so typical compilers
 * add no padding) and on a little-endian host, because all numeric fields
 * are stored little-endian in the file.
 */
typedef struct WAV_FORMAT_T {
    // RIFF header
    char chunkID[4];     // Contains the letters "RIFF"
    int chunkSize;       // Size of the entire file in bytes minus 8 (chunkID and chunkSize are not counted)
    char format[4];      // Contains the letters "WAVE"

    // fmt sub-chunk
    char subchunk1[4];   // Contains the letters "fmt " (with a trailing space)
    int subchunk1Size;   // 16 for PCM
    short audioFormat;   // PCM = 1
    short numChannels;   // Mono = 1, Stereo = 2
    int sampleRate;      // 8000, 16000, 44100, etc.
    int byteRate;        // sampleRate * numChannels * Bytes per sample
    short blockAlign;    // numChannels * Bytes per sample (bytes in one sample frame)
    short bitsPerSample; // Bits in one sample of a single channel: 8, 16, etc.

    // data sub-chunk
    char subChunk2[4];   // Contains the letters "data"
    int subChunk2Size;   // Num of Samples * numChannels * Bytes per sample
    char bytes[];        // The actual sound data (flexible array member, not counted by sizeof)
} WAV_FORMAT;
PCM Data Chunk
The simplest and most common WAVE file is PCM. The PCM samples are just raw sample data and stored as integers. The bytes per sample field will indicate the range of the sample data:
Bytes per Sample | Minimum Sample | Maximum Sample |
---|---|---|
1 | 0 | 255 |
2 | -32768 | 32767 |
3 | -8388608 | 8388607 |
In addition, the samples at a moment in time are called a sample frame. In a stereo file, a sample frame has 2 samples, one sample for the left channel and the other for right channel.
Parsing the wave header format by C Program
Assume: The audio wave file is processed at 25 frames per second (40 msec per frame) and has two channels (stereo). The sample rate is 16000 Hz and each sample is 16 bits (2 bytes).
Method 1.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Sample frames in one 40 ms processing frame: 16000/25 = 640
#define SAMPLE_PER_FRAME 640
// Number of interleaved channels (stereo)
#define NUM_CHANEL 2
// Bytes in one 16-bit sample
#define BYTE_PER_SAMPLE 2
// Bytes in one processing frame. Parenthesized so the macro expands safely
// inside larger expressions such as x / WAVE_SIZE_PER_FRAME.
#define WAVE_SIZE_PER_FRAME (SAMPLE_PER_FRAME * NUM_CHANEL * BYTE_PER_SAMPLE)
int main(int argc, char **argv) {
FILE *wav_list = NULL;
FILE *fp = NULL;
int frame_num = 0;
WAV_FORMAT wav_chunk;
// Static Memory
char wav_test_case[200];
char wav_per_frame[WAVE_SIZE_PER_FRAME];
// Open Wav file
wav_list = fopen(("wav_test_case.txt"), "rb");
// Protection on Reading file
if (wav_list == NULL) {
printf("Error opening file");
return (-1);
}
// Read Wav file - char * fgets(char* str, int Max num of char, FILE* stream), one test case
if (fgets(wav_test_case, sizeof(wav_test_case), wav_list) != NULL) {
fp = fopen(wav_test_case, "rb");
// Protection on Reading file
if (wav_list == NULL) {
printf("Can't opening wav file");
return (-1);
}
// Parsing WAV FORMAT
fread(&wav_chunk, 1, sizeof(WAV_FORMAT), fp);
printf("fmt sub-chunk: %.3s \n", wav_chunk.subchunk1); // Read fmt sub-chunk
printf("data sub-chunk: %.4s \n", wav_chunk.subChunk2); // Read data sub-chunk
printf("numChannels = %d \n", wav_chunk.numChannels); // Print the Format of Wav
printf("sampleRate = %d \n", wav_chunk.sampleRate);
printf("byteRate = %d \n", wav_chunk.byteRate);
printf("bitsPerSample = %d \n", wav_chunk.bitsPerSample);
printf("sample_alignment (numChannels * bitsPerSample) = %d \n",
wav_chunk.blockAlign);
printf("audio_format = %s \n",
wav_chunk.audioFormat ? "PCM" : "IEEE Float");
/***** Frame Process *****/
while (fread(wav_per_frame, wav_chunk.blockAlign, WAVE_SIZE_PER_FRAME, fp) == WAVE_SIZE_PER_FRAME) {
printf("Frame = %d", frame_num);
/***************************/
/* Doing Signal Process per Frame */
/***************************/
frame_num++;
} /***** End of Frame Process *****/
}
fclose(wav_list);
return 0;
}
Result:
fmt sub-chunk: fmt
data sub-chunk: data
numChannels = 1
sampleRate = 16000
byteRate = 32000
bitsPerSample = 16
sample_alignment (numChannels * bitsPerSample) = 2
audio_format = PCM
Frame = 0
...
Method 2.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Upper bound on the number of processing frames read from one file
#define NUM_FRAME 16000
// Sample frames in one 40 ms processing frame: 16000/25 = 640, 25 fps
#define SAMPLE_PER_FRAME 640
// Number of interleaved channels (stereo)
#define NUM_CHANEL 2
// Bytes in one 16-bit sample
#define BYTE_PER_SAMPLE 2
// Bytes in one processing frame. Parenthesized so the macro expands safely
// inside larger expressions such as x / WAVE_SIZE_PER_FRAME.
#define WAVE_SIZE_PER_FRAME (SAMPLE_PER_FRAME * NUM_CHANEL * BYTE_PER_SAMPLE)
/* Settings for the per-frame signal processing. */
typedef struct SIGNAL_PARA_T {
    /* Passed to fread as the number of blockAlign-sized items read per frame */
    int sampleSize;
} SIGNAL_PARA;

/* Defaults: each frame holds SAMPLE_PER_FRAME items. */
SIGNAL_PARA signal_para = { .sampleSize = SAMPLE_PER_FRAME };
int main(int argc, char **argv) {
FILE *wav_list = NULL;
FILE *fp = NULL;
int frame_num = 0;
WAV_FORMAT wav_chunk;
// Static Memory
char wav_test_case[200];
char wav_per_frame[WAVE_SIZE_PER_FRAME];
// Open Wav file
wav_list = fopen(("wav_test_case.txt"), "rb");
// Protection on Reading file
if (wav_list == NULL) {
printf("Error opening file");
return (-1);
}
// Read Wav file - char * fgets(char* str, int Max num of char, FILE* stream), one test case
if (fgets(wav_test_case, sizeof(wav_test_case), wav_list) != NULL) {
fp = fopen(wav_test_case, "rb");
// Protection on Reading file
if (wav_list == NULL) {
printf("Can't opening wav file");
return (-1);
}
// Parsing WAV FORMAT
fread(&wav_chunk, 1, sizeof(WAV_FORMAT), fp);
// Read fmt sub-chunk
printf("fmt sub-chunk: %.3s \n", wav_chunk.subchunk1);
// Read data sub-chunk
printf("data sub-chunk: %.4s \n", wav_chunk.subChunk2);
//Print the Format of Wav
printf("numChannels = %d \n", wav_chunk.numChannels);
printf("sampleRate = %d \n", wav_chunk.sampleRate);
printf("byteRate = %d \n", wav_chunk.byteRate);
printf("bitsPerSample = %d \n", wav_chunk.bitsPerSample);
printf("sample_alignment (numChannels * bitsPerSample) = %d \n",
wav_chunk.blockAlign);
printf("audio_format = %s \n",
wav_chunk.audioFormat ? "PCM" : "IEEE Float");
/***** Frame Process *****/
for (frame_num = 0; frame_num < NUM_FRAME; frame_num++) {
// Read samples
result = fread(wav_per_frame, wav_chunk.blockAlign, signal_para.sampleSize, fp);
printf("Frame = %d \n", frame_num);
/***************************/
/* Doing Signal Process per Frame */
/***************************/
} /***** End of Frame Process *****/
}
fclose(wav_list);
return 0;
}
Result:
fmt sub-chunk: fmt
data sub-chunk: data
numChannels = 1
sampleRate = 16000
byteRate = 32000
bitsPerSample = 16
sample_alignment (numChannels * bitsPerSample) = 2
audio_format = PCM
Frame = 0
...
=========== To be continued…. ==========
Reference
[1] Wave File Format
Thanks for reading! Feel free to leave the comments below or email to me. Any pieces of advice or discussions are always welcome. :)