I'm trying to use OpenMP in my audio code. I'm using Miniaudio as audio backend, a pretty handy single-header multi-platform library, you can find it here.
Here is my code, I tried to distill it to its minimum working form:
/*** DEFINES */
#define MA_NO_DECODING
#define MA_NO_ENCODING
#define MINIAUDIO_IMPLEMENTATION
#define WAVES_QTY 1000
#define FREQ_INIT 50.f
#define FREQ_STEP 1.f
#define DEVICE_FORMAT ma_format_f32
#define DEVICE_CHANNELS 1
#define DEVICE_SAMPLE_RATE 8000
/* DEFINES end. */
/*** INCLUDES */
/* single header library from https://github.com/mackron/miniaudio/blob/master/miniaudio.h */
#include "miniaudio.h"
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <omp.h>
/* INCLUDES end. */
/*** GLOBALS */
float phaseArray[WAVES_QTY];
float amplitudeArray[WAVES_QTY];
float freqArray[WAVES_QTY];
/* GLOBALS end. */
/*** FUNCTION DECLARATIONS */
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount);
/* FUNCTION DECLARATIONS end. */
/*** MAIN */
int main(){
uint32_t i;
ma_device_config deviceConfig;
ma_device device;
printf("== will now initialize arrays...");
for(i=0;i<WAVES_QTY;i ){
phaseArray[i] = 0.f;
amplitudeArray[i] = 1.f;
freqArray[i] = FREQ_INIT i*FREQ_STEP;
amplitudeArray[i] /= ((float)WAVES_QTY);/* so we don't overflow max volume */
}
printf(" DONE!\n");
deviceConfig = ma_device_config_init(ma_device_type_playback);
deviceConfig.playback.format = DEVICE_FORMAT;
deviceConfig.playback.channels = DEVICE_CHANNELS;
deviceConfig.sampleRate = DEVICE_SAMPLE_RATE;
deviceConfig.dataCallback = data_callback;
if(ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS){
printf("Failed to open playback device.\n");
return -4;
}
printf("== Device Name: %s\n", device.playback.name);
/* this is the actual sound start */
if (ma_device_start(&device) != MA_SUCCESS) {
printf("== Failed to start playback device.\n");
ma_device_uninit(&device);
return -5;
}
printf("~~~ You should hear sound now ~~~\n");
printf("== Press Enter to quit...");
getchar();
ma_device_uninit(&device); /* clean up */
return 0;
}
/* MAIN end. */
/*** FUNCTION DEFINITIONS */
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
float* Samples = pOutput;
ma_uint32 SampleIndex;
for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex ){
uint32_t ii;
*Samples = 0.f;
/**** HERE */
#pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples)
for(ii=0; ii<WAVES_QTY; ii ){
phaseArray[ii] = fmod(phaseArray[ii] (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
*Samples = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
}
Samples ;
}
(void)pDevice;
(void)pInput;
}
/* FUNCTION DEFINITIONS end. */
compiling on Win10, MinGW32 with
gcc -g0 thousandwaves.c -fopenmp -o thousandwaves.exe -Wall -Wextra -Wshadow -Werror=implicit-int -Werror=incompatible-pointer-types -Werror=int-conversion -Wvla -pedantic-errors -ansi
The program is quite simple, it generates 1000 sinewaves and mixes them together in real-time.
I'd like to run the sinewave generation on all available threads using OpenMP, so I added the #pragma omp parallel for
in my data_callback
function (I also marked the exact spot with the comment /**** HERE */
).
It compiles and runs without errors/warnings, but the program generates a very noisy sound, vaguely resembling the original.(just comment/delete the #pragma to hear how it should sound).
I guess it has something to do with the nested for loops, but I can't pinpoint the problem. I'm targeting only the inner loop with my #pragma, so it should parallelize only that section.
Let me know what do you think, thanks :))
EDIT:
thanks a lot all, the reduction
clause does the trick indeed, and also, using atomic
inside the loop further speeds up things
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
float* Samples = pOutput;
ma_uint32 SampleIndex;
for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex ){
uint32_t ii;
float sample = 0.f;
#pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples) reduction( :sample)
for(ii=0; ii<WAVES_QTY; ii ){
phaseArray[ii] = fmod(phaseArray[ii] (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
#pragma omp atomic update
sample = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
}
*Samples = sample;
Samples ;
}
(void)pDevice;
(void)pInput;
}
CodePudding user response:
all threads are writing to *Samples
, this is a race condition which should be atomic, and would result in slower code than serial code.
#pragma omp atomic
*Samples = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
Edit: as pointed by @Jérôme a reduction will be faster
void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) {
float* Samples = pOutput;
ma_uint32 SampleIndex;
for (SampleIndex = 0; SampleIndex < frameCount; SampleIndex ) {
int32_t ii;
*Samples = 0.f;
float sample = 0.f;
/**** HERE */
#pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray) reduction( :sample)
for (ii = 0; ii < WAVES_QTY; ii ) {
phaseArray[ii] = fmod(phaseArray[ii] (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
sample = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
}
*Samples = sample;
Samples ;
}
(void)pDevice;
(void)pInput;
}