我正在嘗試使演算法在OpenCL. 我使用這個存盤庫 ( Source.cpp) 作為模板。我現在想將整個程式轉換為long演算法型別而不是float. 但我總是CL_INVALID_VALUE在第二個得到一個 (-30) 例外clEnqueueWriteBuffer。我浪費了幾個小時沒有發現錯誤,所以也許我已經監督了一些明顯的事情(我還沒有對 opencl 做太多......)?
我的代碼(不作業)
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;
//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid 1)-1; \n"\
"int bi = offset*(2*thid 2)-1; \n"\
"temp[bi] = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] = t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/
int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup \n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long i;
for (i = 1; i < DATA_SIZE - 1;i )
{
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
}
clock_t ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
cl_int result;
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE; // num of processors
loc = 256;
printf("\n>> start parallel ---------- \n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clFinish(command_queue);
ends = clock();
// print the results
int k = 1;
for (k = 1;k < 8; k )
{
printf("%d - ", k);
printf("%d \n", results[k]);
}
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("\n>>finished parallel in %lf seconds\n", time_taken);
// cleanup - release OpenCL resources
printf("\n-------------------------------------\n");
/* -------sequential ------- */
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx ) {
prefixSum[idx] = prefixSum[idx - 1] arr[idx];
}
const clock_t endSequential = clock();
double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("\n>> finished sequential in %lf\n", seqTime);
for (int j = 0;j < 8; j )
{
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
存盤庫代碼(作業):
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;
const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid 1)-1; \n"\
"int bi = offset*(2*thid 2)-1; \n"\
"temp[bi] = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] = t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/
int main(void)
{
cl_uint num_platforms = 0;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
outfile.open("shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
for (i = 0; i < DATA_SIZE;i )
{
inputDataA[i] = (float)i;
}
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i )
{
printf("%f \n", results[i]);
outfile << results[i] << " ";
}
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
提前致謝
uj5u.com熱心網友回復:
我發現了你的錯誤。對我來說,它首先不會編譯 OpenCL C 代碼,所以我用
char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);
獲取構建日志:
<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");
似乎'不是\"問題。更改此行:
"printf(\"%d\",thid); \n"
然后 OpenCL C 代碼編譯,我可以重現 CL_INVALID_VALUE 錯誤。
這是問題所在:您使用clEnqueueWriteBuffer將資料從復制inputB到0. 您需要添加 C 陣列以將資料復制到:
long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];
和
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
然后它作業,我得到這個輸出:
Setup
0
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
-------------------------------------
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36
另請注意,無論出于何種愚蠢的原因,在 OpenCL C 中,long= 64 位整數,但在 C 中,long=“至少”32 位整數。在 C 中,您應該使用long long int它,因為這始終是 64 位整數。例如typedef int64_t slong;,您可以使用 whereint64_t本身是long long int.
另一個問題是程式不是確定性的。多次執行時,每次都會得到不同的結果。必須存在一些競爭條件。我想你錯誤地認為它barrier(CLK_GLOBAL_MEM_FENCE);提供了所有執行緒的全域同步,但事實并非如此。唯一的全域同步是在所需的同步點將內核拆分為多個內核,并一個接一個地執行它們。
最后,為了使 C 中的 OpenCL 開發更容易,并防止在這些簡單的錯誤上浪費時間,我創建了一個輕量級的OpenCL-Wrapper來消除所有 OpenCL 代碼開銷。這樣,您的代碼將縮短 4 倍,并且更易于理解:
#include "opencl.hpp"
#define DATA_SIZE 1024
int main() {
Clock clock;
clock.start();
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
for(int i=1; i<DATA_SIZE-1; i ) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
}
inputA.write_to_device();
Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
kernel.add_constants(DATA_SIZE);
kernel.run();
output.read_from_device();
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k ) {
printf("%d - ", k);
printf("%d \n", output[k]);
}
printf("\n>>finished parallel in %lf seconds\n", time_taken);
printf("\n-------------------------------------\n");
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
clock.start();
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx ) {
prefixSum[idx] = prefixSum[idx-1] arr[idx];
}
double seqTime = clock.stop();
printf("\n>> finished sequential in %lf\n", seqTime);
for(int j=0; j<8; j ) {
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with ) R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
int thid = get_global_id(0);
int offset = 1;
printf("%d",thid);
temp[2*thid] = input[2*thid];
temp[2*thid 1] = input[2*thid 1];
for(int d= size>>1; d>0; d >>= 1) {
barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid 1)-1;
int bi = offset*(2*thid 2)-1;
temp[bi] = temp[ai];
}
offset = offset*2;
}
temp[size-1] = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
for(int d = 1; d<size; d *= 2) {
offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1;
long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] = t;
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
output[2*thid] = temp[2*thid];
output[2*thid 1] = temp[2*thid 1];
}
);} // ############################################################### end of OpenCL C code #####################################################################
轉載請註明出處,本文鏈接:https://www.uj5u.com/qukuanlian/466751.html
下一篇:兩組頂點之間的最短路徑
