clEnqueueWriteBuffer上的OpenCLCL_INVALID

我正在嘗試使演算法在OpenCL. 我使用這個存盤庫 ( Source.cpp) 作為模板。我現在想將整個程式轉換為long演算法型別而不是float. 但我總是CL_INVALID_VALUE在第二個得到一個 (-30) 例外clEnqueueWriteBuffer。我浪費了幾個小時沒有發現錯誤，所以也許我已經監督了一些明顯的事情（我還沒有對 opencl 做太多......）？

我的代碼（不作業）

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>

//#define DATA_SIZE 1024
#define DATA_SIZE 1024

using namespace std;


//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid   1)-1; \n"\
"int bi = offset*(2*thid   2)-1; \n"\
"temp[bi]  = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/




int main(void)
{
    cl_context context;
    cl_context_properties properties[3];

    cl_command_queue command_queue;
    cl_kernel kernel;
    cl_program program;
    cl_int err;
    cl_uint num_platforms = 0;
    cl_platform_id* platforms;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    size_t global, loc;
    std::cout << "Setup \n";

    long arr[DATA_SIZE];
    long inputDataA[DATA_SIZE];
    long results[2 * DATA_SIZE];
    long  i;
    for (i = 1; i < DATA_SIZE - 1;i  )
    {
        inputDataA[i-1] = (long)i;
        arr[i-1] = (long)i;
    }
    clock_t ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput
    cl_int result;
    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);
    clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS); // ERROR HERE 
    clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    int temp = DATA_SIZE;

    clock_t start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);


    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);


    global = DATA_SIZE; // num of processors
    loc = 256;

    printf("\n>> start parallel ---------- \n");
    // enqueue the kernel command for execution
    clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);

    // copy the results from out of the output buffer
    clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
    assert(clResult == CL_SUCCESS);


    clFinish(command_queue);
    ends = clock();

    // print the results
    int k = 1;
    for (k = 1;k < 8; k  )
    {
        printf("%d - ", k);
        printf("%d \n", results[k]);
    }
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    printf("\n>>finished parallel in %lf seconds\n", time_taken);

    // cleanup - release OpenCL resources

    printf("\n-------------------------------------\n");


    /* -------sequential ------- */
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    const clock_t startSequential = clock();

    prefixSum[0] = arr[0];
    long idx = 1;

    for (idx = 1; idx < DATA_SIZE; idx  ) {
        prefixSum[idx] = prefixSum[idx - 1]   arr[idx];
    }
    const clock_t endSequential = clock();




    double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;

    printf("\n>> finished sequential in %lf\n", seqTime);
    for (int j = 0;j < 8; j  )
    {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }


    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    return 0;
}

存盤庫代碼（作業）：

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>

#include <CL/cl.h>

#define DATA_SIZE 1024
using namespace std;
ofstream outfile;


const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid 1] = input[2*thid 1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid   1)-1; \n"\
"int bi = offset*(2*thid   2)-1; \n"\
"temp[bi]  = temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t; }  \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid 1] = temp[2*thid 1]; \n"\
"}\n"\
"\n";
/*
*/



int main(void)
{
    cl_uint num_platforms = 0;

    cl_context context;
    cl_context_properties properties[3];
    cl_kernel kernel;
    cl_platform_id* platforms;
    cl_command_queue command_queue;
    cl_program program;
    cl_int err;
    cl_uint num_of_platforms = 0;
    cl_platform_id platform_id;
    cl_device_id device_id;
    cl_uint num_of_devices = 0;
    cl_mem inputA, inputB, output;
    outfile.open("shubham.txt");
    size_t global, loc;

    float inputDataA[DATA_SIZE];
    float results[2 * DATA_SIZE] = { 0 };

    int i;
    for (i = 0; i < DATA_SIZE;i  )
    {
        inputDataA[i] = (float)i;
    }
    clock_t start, ends;

    /* --------------------- Get platform ---------------------*/
    cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
    assert(clResult == CL_SUCCESS);

    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
    assert(clResult == CL_SUCCESS);
    /* --------------------- ------------ ---------------------*/


    /* --------------------- Get devices ---------------------*/
    cl_device_id* devices = NULL;
    cl_uint num_devices;

    clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
    assert(clResult == CL_SUCCESS);

    devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);

    if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
    {
        printf("could not find device id");
    }
    assert(clResult == CL_SUCCESS);
    /* --------------------- ----------- ---------------------*/
    properties[0] = CL_CONTEXT_PLATFORM;
    properties[1] = 0;
    cl_int contextResult;
    context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
    assert(contextResult == CL_SUCCESS);
    // create command queue using the context and device

    // create command queue using the context and device
    command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);

    // create a program from the kernel source code
    program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);

    // compile the program
    if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
    {
        printf("Error building program\n");
        return 1;
    }

    // specify which kernel from the program to execute
    kernel = clCreateKernel(program, "add", &err);

    // create buffers for the input and ouput

    inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);

    // load data into the input buffer
    clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);

    int temp = DATA_SIZE;

    start = clock();

    // set the argument list for the kernel command
    clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    assert(clResult == CL_SUCCESS);

    clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
    assert(clResult == CL_SUCCESS);
    clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
    assert(clResult == CL_SUCCESS);

    global = DATA_SIZE;
    loc = 256;
    // enqueue the kernel command for execution
    clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
    clFinish(command_queue);

    // copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
    //clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);

    // print the results
    printf("output: ");

    for (i = 0;i < 5; i  )
    {
        printf("%f \n", results[i]);
        outfile << results[i] << " ";
    }
    ends = clock();
    double time_taken = ((double)(ends - start)) / CLK_TCK;
    outfile << endl << "Time taken is : " << time_taken << endl;

    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    return 0;
}

提前致謝

uj5u.com熱心網友回復：

我發現了你的錯誤。對我來說，它首先不會編譯 OpenCL C 代碼，所以我用

char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);

獲取構建日志：

<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
       ^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
       ^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");

似乎'不是\"問題。更改此行：

"printf(\"%d\",thid); \n"

然后 OpenCL C 代碼編譯，我可以重現 CL_INVALID_VALUE 錯誤。

這是問題所在：您使用clEnqueueWriteBuffer將資料從復制inputB到0. 您需要添加 C 陣列以將資料復制到：

long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];

和

clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

然后它作業，我得到這個輸出：

Setup
0

>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12

>>finished parallel in 0.023000 seconds

-------------------------------------

>> start sequential ----------

>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36

另請注意，無論出于何種愚蠢的原因，在 OpenCL C 中，long= 64 位整數，但在 C 中，long=“至少”32 位整數。在 C 中，您應該使用long long int它，因為這始終是 64 位整數。例如typedef int64_t slong;，您可以使用 whereint64_t本身是long long int.

另一個問題是程式不是確定性的。多次執行時，每次都會得到不同的結果。必須存在一些競爭條件。我想你錯誤地認為它barrier(CLK_GLOBAL_MEM_FENCE);提供了所有執行緒的全域同步，但事實并非如此。唯一的全域同步是在所需的同步點將內核拆分為多個內核，并一個接一個地執行它們。

最后，為了使 C 中的 OpenCL 開發更容易，并防止在這些簡單的錯誤上浪費時間，我創建了一個輕量級的OpenCL-Wrapper來消除所有 OpenCL 代碼開銷。這樣，您的代碼將縮短 4 倍，并且更易于理解：

#include "opencl.hpp"

#define DATA_SIZE 1024

int main() {
    Clock clock;
    clock.start();

    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device

    Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
    Memory<slong> inputA(device, DATA_SIZE);
    Memory<slong> inputB(device, DATA_SIZE);
    Memory<slong> output(device, DATA_SIZE);

    for(int i=1; i<DATA_SIZE-1; i  ) {
        inputA[i-1] = (slong)i;
        arr[i-1] = (slong)i;
    }
    inputA.write_to_device();
    
    Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
    kernel.add_constants(DATA_SIZE);

    kernel.run();
    output.read_from_device();

    double time_taken = clock.stop();

    // print the results
    for(int k=1; k<8; k  ) {
        printf("%d - ", k);
        printf("%d \n", output[k]);
    }
    printf("\n>>finished parallel in %lf seconds\n", time_taken);
    printf("\n-------------------------------------\n");
    printf("\n>> start sequential ---------- \n");
    long prefixSum[DATA_SIZE] = { 0 };

    clock.start();
    prefixSum[0] = arr[0];
    for(long idx=1; idx<DATA_SIZE; idx  ) {
        prefixSum[idx] = prefixSum[idx-1] arr[idx];
    }
    double seqTime = clock.stop();

    printf("\n>> finished sequential in %lf\n", seqTime);
    for(int j=0; j<8; j  ) {
        printf("%d - ", j);
        printf("%d \n", prefixSum[j]);
    }

    wait();
    return 0;
}

#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with ) R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################

kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
    int thid = get_global_id(0);
    int offset = 1;
    printf("%d",thid);
    temp[2*thid] = input[2*thid];
    temp[2*thid 1] = input[2*thid 1];
    for(int d= size>>1; d>0; d >>= 1) {
        barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid   1)-1;
            int bi = offset*(2*thid   2)-1;
            temp[bi]  = temp[ai];
        }
        offset = offset*2;
    }
    temp[size-1] = 0;
    barrier(CLK_GLOBAL_MEM_FENCE);
    for(int d = 1; d<size; d *= 2) {
        offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
        if(thid < d) {
            int ai = offset*(2*thid 1)-1; int bi = offset*(2*thid 2)-1;
            long t = temp[ai]; temp[ai] = temp[bi]; temp[bi]  = t;
        }
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
    output[2*thid] = temp[2*thid];
    output[2*thid 1] = temp[2*thid 1];
}

);} // ############################################################### end of OpenCL C code #####################################################################

轉載請註明出處，本文鏈接：https://www.uj5u.com/qukuanlian/466751.html

標籤：C 算法开放式

上一篇：調整Floyd-Warshall演算法以檢測周期

下一篇：兩組頂點之間的最短路徑

clEnqueueWriteBuffer上的OpenCLCL_INVALID_VALUE錯誤