使用OpenMP多執行緒的Cuda設備記憶體變數會產生錯誤的結果-有解無憂

我有一個函式，在迴圈中連續呼叫 CUDA 內核。此函式透過 OpenMP 由多個執行緒並行執行。每次迭代時，我都會更新變數 currentTime：

cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id);

其中 minChangeTime 在內核中計算。不知何故，當使用 OpenMP 並行呼叫多個內核時，變數 currentTime 的更新未能正確完成。我在最後提供了一份可重現問題的程式碼。我期待的結果是：

但是在啟用 OpenMP 時，我沒有得到 121 的差異：

我做錯了什麼，或是誤解了什麼？如果設備記憶體變數在這裡不合適，那麼更好的變數型別是什麼？非常感謝您的提示或幫助。

// Qualifier shims: under nvcc (which defines __CUDACC__) these expand to the
// CUDA function/variable qualifiers; under any other compiler they expand to
// nothing, so the qualifiers simply disappear from the declarations.
#ifdef __CUDACC__
#define CUDA_HOSTDEV __host__ __device__
#define CUDA_DEVICE __device__
#define CUDA_GLOBAL __global__
#define CUDA_CONST __constant__
#else
#define CUDA_HOSTDEV
#define CUDA_DEVICE
#define CUDA_GLOBAL
#define CUDA_CONST
#endif

#include <cuda.h>
#include <cuda_runtime.h>
#include <omp.h>

#include <iostream>

#include "helper_cuda.h"
#include "helper_functions.h"

// Device-resident running minimum. iteration_function() seeds it from the host
// before each pass, reduction() lowers it with atomicMin, and the host copies
// it back afterwards. It is a single file-scope variable, so every stream and
// every host thread that runs iteration_function() reads and writes the same
// storage.
CUDA_DEVICE int minChangeTime;
// Not referenced anywhere in the visible code.
CUDA_DEVICE bool foundMinimum;

/*
 * Lowers the device global minChangeTime to the smallest of
 * cu_adjustment_time[1 .. blockDim.x - 1] using atomicMin.
 *
 * Launch layout: one 1-D block. Only threadIdx.x is used for indexing, so any
 * additional blocks would repeat the same reads. The array is assumed to hold
 * at least blockDim.x ints (the caller in this file launches <<<1, num_nets>>>
 * over a num_nets-element buffer).
 *
 * minChangeTime must be seeded by the caller (e.g. with an upper bound) before
 * the launch; this kernel only ever lowers it.
 *
 * NOTE(review): the stride pattern reads index tid + s with s = 1, 2, 4, ...,
 * so element 0 is never examined. That is the original behaviour and is kept
 * unchanged here; confirm whether it is intentional.
 */
CUDA_GLOBAL void reduction(
  int* cu_adjustment_time
  ){

  const unsigned int tid = threadIdx.x;

  for (unsigned int s = 1; s < blockDim.x; s *= 2) {
    // The second test keeps tid + s inside the block-sized array when
    // blockDim.x is not a power of two; for power-of-two sizes it is always
    // true and changes nothing.
    if (tid % (2 * s) == 0 && tid + s < blockDim.x) {
      atomicMin(&minChangeTime, cu_adjustment_time[tid + s]);
    }
    // Outside the branch, so every thread of the block reaches the barrier on
    // every iteration.
    __syncthreads();
  }
}

/*
 * Fills cu_adjustment_time so that element g holds currentTime + g, where g is
 * the launching thread's global 1-D index.
 *
 * There is no bounds check: the buffer must hold at least
 * gridDim.x * blockDim.x ints.
 */
CUDA_GLOBAL void wh(int* cu_adjustment_time, int currentTime){
  int tid = threadIdx.x + blockDim.x*blockIdx.x;
  cu_adjustment_time[tid] = currentTime + tid;
}

/*
 * Steps a host-side clock from 0 until it reaches (*iRows - 1), running one
 * wh + reduction pass on the GPU per step and printing
 * "<loop> <currentTime> <time_data_index>" after each step.
 *
 * Parameters:
 *   iRows            pointer to the row count; the loop stops once
 *                    currentTime >= *iRows - 1.
 *   time_data_index  starting value of the second printed counter (passed by
 *                    value, so the caller's copy is not modified).
 *   num_nets         number of ints in the scratch buffer; also used as the
 *                    grid size for wh and the block size for reduction.
 *   stream_id        stream on which all copies and kernels are issued.
 *
 * NOTE: not safe to run concurrently from several host threads. Every call
 * seeds, reduces into and reads back the single device global minChangeTime,
 * so overlapping calls on different streams overwrite each other's value.
 */
void iteration_function(int *iRows, int time_data_index, int num_nets, cudaStream_t stream_id){

    int currentTime = 0;
    const int limit = (*iRows) - 1;
    // currentTime is 0 here, so this leaves time_data_index unchanged.
    time_data_index += currentTime;

    int* cu_adjustment_time;
    checkCudaErrors(cudaMalloc((void **)&cu_adjustment_time, sizeof(int) * num_nets));

    int loop = 0;
    while (currentTime < limit) {

        // Seed the device-side minimum with the upper bound; reduction() can
        // only lower it from here.
        checkCudaErrors(cudaMemcpyToSymbolAsync(minChangeTime, &limit, sizeof(limit), 0, cudaMemcpyHostToDevice, stream_id));

        // Work issued to one stream executes in issue order, so no host-side
        // synchronisation is needed between the copy and the two kernels.
        wh<<<num_nets, 1, 0, stream_id>>>(
            cu_adjustment_time,
            currentTime
        );
        checkCudaErrors(cudaGetLastError());

        reduction<<<1, num_nets, 0, stream_id>>>(
          cu_adjustment_time
        );
        checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpyFromSymbolAsync(&currentTime, minChangeTime, sizeof(currentTime), 0, cudaMemcpyDeviceToHost, stream_id));
        // The host reads currentTime right below, so wait for the copy here.
        checkCudaErrors(cudaStreamSynchronize(stream_id));

        currentTime += num_nets;
        time_data_index += num_nets + 1;

        std::cout << loop << " " << currentTime << " " << time_data_index << std::endl;
        loop++;

    }
    std::cout << "finished" << std::endl;

    // Release the scratch buffer; the original leaked it on every call.
    checkCudaErrors(cudaFree(cu_adjustment_time));

}

/*
 * Driver: runs iteration_function() `iter` times, each time on a freshly
 * created stream that is synchronised and destroyed afterwards.
 */
int main(){
    //compiled with: nvcc no_fun.cu -Xcompiler=-fopenmp -o no_fun
    int iRows = 3000;
    const int iter = 300;
    int time_data_index = 121;
    int num_nets = 64;

    // Left disabled on purpose: with the loop parallelised, several host
    // threads run iteration_function() at once and all of them share the one
    // device global minChangeTime, which gives the wrong results described.
    //#pragma omp parallel for simd schedule(dynamic) -> including this part causes undefined results
    for (int j = 0; j < iter; j++) {
        // Each iteration owns its stream, so no array of streams is needed.
        cudaStream_t stream;
        checkCudaErrors(cudaStreamCreate(&stream));
        iteration_function(&iRows, time_data_index, num_nets, stream);
        checkCudaErrors(cudaStreamSynchronize(stream));
        checkCudaErrors(cudaStreamDestroy(stream));
    }

    return 0;
}

uj5u.com熱心網友回復：

當多個 reduction 內核同時運行時，全域變數 minChangeTime 會出現競爭條件。您需要為每個應該並行運行的內核準備各自獨立的設備記憶體。最簡單的方法是不要將 minChangeTime 宣告為全域變數，而是在每個執行緒中用 cudaMalloc 為它配置記憶體，然後將該指標傳遞給內核。

轉載請註明出處，本文鏈接：https://www.uj5u.com/gongcheng/412299.html

標籤：

上一篇：我可以在python的守護執行緒中運行清理代碼嗎？

下一篇：如何在此回圈中獲取執行緒/子行程回傳的值，以驗證它們是否將繼續處于活動狀態或是否應該停用？