why is OpenMP messing this for loop?-CodePudding

I'm trying to use OpenMP in my audio code. I'm using Miniaudio as audio backend, a pretty handy single-header multi-platform library, you can find it here.

Here is my code, I tried to distill it to its minimum working form:

/*** DEFINES */
    #define MA_NO_DECODING
    #define MA_NO_ENCODING
    #define MINIAUDIO_IMPLEMENTATION

    #define WAVES_QTY 1000
    #define FREQ_INIT 50.f
    #define FREQ_STEP 1.f

    #define DEVICE_FORMAT ma_format_f32
    #define DEVICE_CHANNELS 1
    #define DEVICE_SAMPLE_RATE 8000
/* DEFINES end. */

/*** INCLUDES */
    /* single header library from https://github.com/mackron/miniaudio/blob/master/miniaudio.h */
    #include "miniaudio.h"

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>
    #include <omp.h>
/* INCLUDES end. */

/*** GLOBALS */
    float phaseArray[WAVES_QTY];
    float amplitudeArray[WAVES_QTY];
    float freqArray[WAVES_QTY];
/* GLOBALS end. */

/*** FUNCTION DECLARATIONS */
    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount);
/* FUNCTION DECLARATIONS end. */    

/*** MAIN */
    int main(){
        uint32_t i;
        ma_device_config deviceConfig;
        ma_device device;

        printf("== will now initialize arrays...");
        for(i=0;i<WAVES_QTY;i  ){
            phaseArray[i] = 0.f;
            amplitudeArray[i] = 1.f;
            freqArray[i] = FREQ_INIT   i*FREQ_STEP;
            amplitudeArray[i] /=  ((float)WAVES_QTY);/* so we don't overflow max volume */
        }
        printf(" DONE!\n");

        deviceConfig = ma_device_config_init(ma_device_type_playback);
        deviceConfig.playback.format   = DEVICE_FORMAT;
        deviceConfig.playback.channels = DEVICE_CHANNELS;
        deviceConfig.sampleRate        = DEVICE_SAMPLE_RATE;
        deviceConfig.dataCallback      = data_callback;
        if(ma_device_init(NULL, &deviceConfig, &device) != MA_SUCCESS){
            printf("Failed to open playback device.\n");
            return -4;
        }

        printf("== Device Name: %s\n", device.playback.name);

        /* this is the actual sound start */
        if (ma_device_start(&device) != MA_SUCCESS) {
            printf("== Failed to start playback device.\n");
            ma_device_uninit(&device);
            return -5;
        }

        printf("~~~ You should hear sound now ~~~\n");
        printf("== Press Enter to quit...");
        getchar();

        ma_device_uninit(&device); /* clean up */

        return 0;
    }
/* MAIN end. */

/*** FUNCTION DEFINITIONS */
    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
        float* Samples = pOutput;
        ma_uint32 SampleIndex;

        for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex  ){
            uint32_t ii;
            *Samples = 0.f;

            /**** HERE */
            #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples)
            for(ii=0;  ii<WAVES_QTY; ii  ){
                phaseArray[ii] = fmod(phaseArray[ii]   (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
                *Samples  = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
            }

            Samples  ;
        }
        (void)pDevice;
        (void)pInput;
    }
/* FUNCTION DEFINITIONS end. */

compiling on Win10, MinGW32 with gcc -g0 thousandwaves.c -fopenmp -o thousandwaves.exe -Wall -Wextra -Wshadow -Werror=implicit-int -Werror=incompatible-pointer-types -Werror=int-conversion -Wvla -pedantic-errors -ansi

The program is quite simple, it generates 1000 sinewaves and mixes them together in real-time. I'd like to run the sinewave generation on all available threads using OpenMP, so I added the #pragma omp parallel for in my data_callback function (I also marked the exact spot with the comment /**** HERE */). It compiles and runs without errors/warnings, but the program generates a very noisy sound, vaguely resembling the original.(just comment/delete the #pragma to hear how it should sound).

I guess it has something to do with the nested for loops, but I can't pinpoint the problem. I'm targeting only the inner loop with my #pragma, so it should parallelize only that section.

Let me know what do you think, thanks :))

EDIT: thanks a lot all, the reduction clause does the trick indeed, and also, using atomic inside the loop further speeds up things

    void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount){
        float* Samples = pOutput;
        ma_uint32 SampleIndex;

        for(SampleIndex = 0; SampleIndex < frameCount; SampleIndex  ){
            uint32_t ii;
            float sample = 0.f;

            #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray, Samples) reduction( :sample)
            for(ii=0;  ii<WAVES_QTY; ii  ){
                phaseArray[ii] = fmod(phaseArray[ii]   (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
                #pragma omp atomic update
                sample  = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
                
            }

            *Samples = sample;
            Samples  ;
        }
        (void)pDevice;
        (void)pInput;
    }

CodePudding user response：

all threads are writing to *Samples, this is a race condition which should be atomic, and would result in slower code than serial code.

#pragma omp atomic
*Samples  = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];

Edit: as pointed by @Jérôme a reduction will be faster

void data_callback(ma_device* pDevice, void* pOutput, const void* pInput, ma_uint32 frameCount) {
    float* Samples = pOutput;
    ma_uint32 SampleIndex;

    for (SampleIndex = 0; SampleIndex < frameCount; SampleIndex  ) {
        int32_t ii;
        *Samples = 0.f;
        float sample = 0.f;

        /**** HERE */
        #pragma omp parallel for private(ii) shared(phaseArray, freqArray, amplitudeArray) reduction( :sample)
        for (ii = 0; ii < WAVES_QTY; ii  ) {
            phaseArray[ii] = fmod(phaseArray[ii]   (freqArray[ii] / (float)(DEVICE_SAMPLE_RATE)), 1.f);
            sample  = (float)sin((double)(phaseArray[ii] * (float)MA_TAU)) * amplitudeArray[ii];
        }
        *Samples = sample;
        Samples  ;
    }
    (void)pDevice;
    (void)pInput;
}