使用CUDA在C中并行化基數排序的問題-有解無憂

我正在嘗試使用 C 中的 CUDA 實作一個基數排序演算法，以便能夠并行化它；代碼如下：

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_profiler_api.h>

int* populateArray(int * arr, int n){
    int j=0;
    for (int i=n; i>0 ; i--){
        arr[j]= i;
        j=j 1;
    }
    return arr;
    }

void printArray(int * array, int size){
    printf("Ordered array:\n");
    for (int j=0; j<size; j  ){
        printf("%d ", array[j]);
    }
    printf("\n");
}

__device__  int getMax(int* array, int n) {
  int max = array[0];
  for (int i = 1; i < n; i  )
    if (array[i] > max)
      max = array[i];
  return max;
}

__device__ void countingSort(int* array,int size,int digit, int index, int* output) {
  int count[10]={0};

  for (int i = 0; i < size; i  )
    count[(array[i] / digit) % 10]  ;

  for (int i = 1; i < 10; i  )
    count[i]  = count[i - 1];

  for (int i = size - 1; i >= 0; i--) {
    output[count[(array[i] /digit) % 10] - 1] = array[i];
    count[(array[i] / digit) % 10]--;
    }

  for (int i = 0; i < size; i  )
    array[i]= output[i];
  }


__global__ void radixsort(int* array, int size, int* output) {
  // define index
  int index = blockIdx.x * blockDim.x   threadIdx.x;
  
  // check that the thread is not out of the vector boundary
  if (index >= size) return;

  int max = getMax(array, size);
  for (int digit = 1; max / digit > 0; digit *= 10){
    countingSort(array, size, digit, index, output);
    }
 
}

int main(int argc, char *argv[]) {
    // Init array
    int n = 1000;
    int* array_h = (int*) malloc(sizeof(int) * n);
    populateArray(array_h, n);
    
    // allocate memory on device
    int* array_dev;
    cudaMalloc((void**)&array_dev, n*sizeof(int));
    int* output;
    cudaMalloc((void**)&output, n*sizeof(int));

    // copy data from host to device
    cudaMemcpy(array_dev, array_h, n*sizeof(int), cudaMemcpyHostToDevice);

    dim3 block(32);
    dim3 grid((n-1)/block.x   1);
    printf("Number of threads for each block: %d\n", block.x);
    printf("Number of blocks in the grid: %d\n", grid.x);

    // Create start and stop CUDA events 
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    cudaError_t mycudaerror;
    mycudaerror = cudaGetLastError();
    // define the execution configuration
    radixsort<<<grid,block>>>(array_dev, n, output);

    // device synchronization and cudaGetLastError call
    mycudaerror = cudaGetLastError();
    if(mycudaerror != cudaSuccess)  {
      fprintf(stderr,"%s\n",cudaGetErrorString(mycudaerror)) ;
      //printf("Error in kernel!");
      exit(1);
    }

    // event record, synchronization, elapsed time and destruction
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float elapsed;
    cudaEventElapsedTime(&elapsed, start, stop);
    elapsed = elapsed/1000.f; // convert to seconds
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("Number of elements in the array: %d\n", n);
    printf("Kernel elapsed time: %.5f s\n", elapsed);

    // copy back results from device
    cudaMemcpy(array_h, array_dev, n*sizeof(int), cudaMemcpyDeviceToHost);

    // print ordered array
    printArray(array_h, n);
    
    // free resources on device
    cudaFree(array_dev);
    cudaFree(output);

    // free resources on host
    free(array_h);

    cudaProfilerStop();
    return 0; 
}

為了運行它，我使用的是 Google Colab。每個塊的最大執行緒數固定為 32（網格變數），而使用的塊數在 main 中根據需要排序的元素數（塊變數）計算。

當我開始更改要排序的陣列中的元素數量（main 中存在的變數“n”）時，就會出現問題，因為一旦超過某個閾值，排序將不再正確執行。

為了獲得有關此錯誤執行的更多資訊，我還使用了命令
cuda-memcheck，nvcc -lineinfo發現錯誤是由于內核中的這行代碼引起的：

output[count[(array[i] /digit) % 10] - 1] = array[i];

進行了幾次嘗試，錯誤似乎主要在于計算我要寫入的“輸出”陣列的索引；但是，例如，當我嘗試對 32 或 64 個元素進行排序時，不會發生此錯誤。因此，我想知道我是否在代碼中做錯了什么，或者簡單地說，基數排序在我嘗試的方式中是否不可并行化。我知道讓每個執行緒在不使用相對于每個執行緒的索引的情況下進行排序在計算上是非常繁重的，但首先我想嘗試解決這個問題，然后嘗試總體上優化代碼。

各種經過驗證的方法包括：

使用原子指令；
在內核中宣告“輸出”陣列，不要將它作為 main 的指標傳遞；
僅使用一個執行緒塊來計算排序陣列（一種奇怪的方法似乎有效，但考慮到塊數的動態分配，這是無用的）

感謝您的任何回復。

uj5u.com熱心網友回復：

您似乎已經明白這不是解決此問題的方法：

我知道讓每個執行緒在不使用相對于每個執行緒的索引的情況下進行排序在計算上是非常繁重的，但首先我想嘗試解決這個問題，然后嘗試總體上優化代碼。

幾乎任何本質上是串行的代碼都可以放在 CUDA 內核中，在單個執行緒中運行，并且應該產生相同的結果。然而，這不是撰寫 CUDA 代碼的方式；表現會很慘淡。

在許多情況下，本質上是串行的演算法不容易適應并行化。您的排序方法就是其中之一。一種（有效）并行化基數排序的典型方法在這里。

盡管如此，將串行演算法放入單個執行緒“應該”作業。當您在多個執行緒中（不必要地）運行相同的串行代碼時，就會出現問題。

如果所有執行緒都完美地步調一致，那么它們都會在相同的指令或時鐘周期內做完全相同的事情。但是 GPU 不是這樣作業的。您可以看到的最大“鎖步”是在經線級別（執行緒塊中有 32 個執行緒）。一旦你進入多個warp，無論它們是在同一個執行緒塊中還是不同的執行緒塊中，你的串行代碼中都會有不同點的執行緒，有效地相互踩踏。

一個簡單的證明點是將網格配置更改為單個執行緒：

dim3 block(1);
dim3 grid(1);

然后錯誤消失，即使對于n = 1000，并且陣列已正確排序。由于單個經線的典型鎖步性質，這也應該有效：

dim3 block(32);
dim3 grid(1);

某些較大的配置在某些情況下可能會起作用，但是如果塊數量足夠多，最終所有執行緒都不會同時啟動，這會導致麻煩。

uj5u.com熱心網友回復：

另一種解決方案是使用具有最高有效位基數排序的單個執行緒將陣列分成多個箱，例如如果使用基數 256，則為 256 個箱。最低有效位第一基數排序。為了使其正常作業，資料需要合理統一（至少對于最重要的數字），以便多個 bin 的大小有點相似。

嘗試并行執行初始步驟很復雜。

轉載請註明出處，本文鏈接：https://www.uj5u.com/qianduan/407413.html

標籤：

上一篇：如何在沒有.sort()的情況下按鍵對python串列進行排序？

下一篇：如何訪問在Angular12中動態創建的元素