Concurrent kernel launch example – CUDA

I'm trying to implement concurrent kernel launches for a very complicated CUDA kernel, so I thought I'd start with a simple example: a kernel that just does a sum reduction. Simple enough. Here it is:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

extern __shared__ char dsmem[];
__device__ double *scratch_space;

/* Tree reduction that first copies the input into dynamically
   allocated shared memory (scratch_space) and reduces there. */
__device__ double NDreduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;
    double *b;

    b = scratch_space;

    for (int i = tid; i < length; i += blockDim.x)
        b[i] = a[i];

    __syncthreads();

    do {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            b[tid] += b[tid + k];

        length = k;
        __syncthreads();
    } while (k > 1);

    return b[0];
}

/* Same tree reduction, done in place on the input array. */
__device__ double reduceSum(double *a, unsigned short length)
{
    const int tid = threadIdx.x;
    unsigned short k = length;

    do {
        k = (k + 1) / 2;

        if (tid < k && tid + k < length)
            a[tid] += a[tid + k];

        length = k;
        __syncthreads();
    } while (k > 1);

    return a[0];
}

__global__ void kernel_thing(double *ad, int size)
{
    double sum_1, sum_2, sum_3;
    time_t begin, end, t1, t2, t3;

    scratch_space = (double *) &dsmem[0];

    /* Time the shared-memory reduction (repeated to stretch the runtime). */
    for (int j = 0; j < 1000000; j++) {
        begin = clock();
        sum_1 = NDreduceSum(ad, size);
        end = clock();
    }

    __syncthreads();

    t1 = end - begin;

    /* Time a serial sum done by thread 0 alone. */
    begin = clock();

    sum_2 = 0;
    if (threadIdx.x == 0) {
        for (int i = 0; i < size; i++) {
            sum_2 += ad[i];
        }
    }

    __syncthreads();

    end = clock();
    t2 = end - begin;

    __syncthreads();

    /* Time the in-place reduction. */
    begin = clock();
    sum_3 = reduceSum(ad, size);
    end = clock();

    __syncthreads();

    t3 = end - begin;

    if (threadIdx.x == 0) {
        printf("Sum found: %lf and %lf and %lf. In %ld and %ld and %ld ticks.\n",
               sum_1, sum_2, sum_3, t1, t2, t3);
    }
}

int main(int argc, char **argv)
{
    int i;
    const int size = 512;
    double *a, *ad, *b, *bd;
    double sum_a, sum_b;
    cudaStream_t stream_a, stream_b;
    cudaError_t result;
    cudaEvent_t a_start, a_stop, b_start, b_stop;

    a = (double *) malloc(sizeof(double) * size);
    b = (double *) malloc(sizeof(double) * size);

    srand48(time(0));

    for (i = 0; i < size; i++)
        a[i] = drand48();
    for (i = 0; i < size; i++)
        b[i] = drand48();

    /* Reference sums computed on the host. */
    sum_a = 0;
    for (i = 0; i < size; i++)
        sum_a += a[i];
    sum_b = 0;
    for (i = 0; i < size; i++)
        sum_b += b[i];

    printf("Looking for sum_a %lf\n", sum_a);
    printf("Looking for sum_b %lf\n", sum_b);

    cudaEventCreate(&a_start);
    cudaEventCreate(&b_start);
    cudaEventCreate(&a_stop);
    cudaEventCreate(&b_stop);

    cudaMalloc((void **) &ad, sizeof(double) * size);
    cudaMalloc((void **) &bd, sizeof(double) * size);

    result = cudaStreamCreate(&stream_a);
    result = cudaStreamCreate(&stream_b);

    result = cudaMemcpyAsync(ad, a, sizeof(double) * size, cudaMemcpyHostToDevice, stream_a);
    result = cudaMemcpyAsync(bd, b, sizeof(double) * size, cudaMemcpyHostToDevice, stream_b);

    /* One kernel per stream; 49152 bytes of dynamic shared memory each. */
    cudaEventRecord(a_start);
    kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
    cudaEventRecord(a_stop);
    cudaEventRecord(b_start);
    kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
    cudaEventRecord(b_stop);

    result = cudaMemcpyAsync(a, ad, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_a);
    result = cudaMemcpyAsync(b, bd, sizeof(double) * size, cudaMemcpyDeviceToHost, stream_b);

    cudaEventSynchronize(a_stop);
    cudaEventSynchronize(b_stop);

    float a_ms = 0;
    float b_ms = 0;
    cudaEventElapsedTime(&a_ms, a_start, a_stop);
    cudaEventElapsedTime(&b_ms, b_start, b_stop);

    printf("%lf ms for A.\n", a_ms);
    printf("%lf ms for B.\n", b_ms);

    result = cudaStreamDestroy(stream_a);
    result = cudaStreamDestroy(stream_b);

    if (result != cudaSuccess) {
        printf("I should probably do this after each important operation.\n");
    }

    /*
    printf("Matrix after:\n");
    for (i = 0; i < size; i++) {
        printf("%lf ", a[i]);
    }
    printf("\n");
    */

    free(a);
    free(b);
    cudaFree(ad);
    cudaFree(bd);

    return 0;
}
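One caveat about the code above (my note, not something the original post addresses): cudaMemcpyAsync is only fully asynchronous when the host buffers are page-locked; with plain malloc'ed buffers, as here, the copies may not overlap with other work. A minimal sketch of a pinned-memory variant of the allocations in main():

double *a, *b;
/* cudaMallocHost allocates page-locked (pinned) host memory,
   which cudaMemcpyAsync needs in order to be truly asynchronous. */
cudaMallocHost((void **) &a, sizeof(double) * size);  /* instead of malloc */
cudaMallocHost((void **) &b, sizeof(double) * size);

/* ... same initialization, copies, and kernel launches as above ... */

cudaFreeHost(a);  /* instead of free */
cudaFreeHost(b);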

It is compiled with the following Makefile (with CC = nvcc, the built-in LINK.c rule expands to roughly nvcc -arch sm_35 parallel.cu -o parallel):

CFLAGS = -arch sm_35
CC = nvcc

all: parallel

parallel: parallel.cu
	$(LINK.c) $^ -o $@

clean:
	rm -f *.o core parallel

I'm running this on a Tesla K20X.

When I run this simple example, I get the following output:

Looking for sum_a 247.983945
Looking for sum_b 248.033749
Sum found: 247.983945 and 247.983945 and 247.983945. In 3242 and 51600 and 4792 ticks.
Sum found: 248.033749 and 248.033749 and 248.033749. In 3314 and 52000 and 4497 ticks.
4645.079102 ms for A.
4630.725098 ms for B.
Application 577759 resources: utime ~8s, stime ~2s, Rss ~82764, inblocks ~406, outblocks ~967

So, as you can see, each kernel gets the correct result and takes about 4.5 seconds, which is what I got in an earlier single-kernel version. Great! But, as you can see from the aprun output, the wall-clock time is actually around 10 seconds, which is much more than the single-kernel version. So it looks like either the kernels are not launching in parallel, or I'm getting hardly any of the speed-up (~2x) I expected from concurrent kernel launches.
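As a quick sanity check (a sketch of mine, not part of the original post), one can ask the runtime whether the device even claims support for concurrent kernel execution, using the standard cudaGetDeviceProperties call:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  /* properties of device 0 */

    /* concurrentKernels is 1 if kernels from different streams can
       run at the same time; it is 1 on a Tesla K20X (cc 3.5). */
    printf("%s: concurrentKernels = %d\n", prop.name, prop.concurrentKernels);
    return 0;
}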

So, the tl;dr of this question: why aren't these two kernels running concurrently, and what can I do to get the expected speed-up?

Thanks for your help.

    The cudaEventRecord operations between the kernel launches are causing serialization: recorded without a stream argument, those events go into the default stream, which implicitly synchronizes with all other (blocking) streams.

    Right now you are getting these results:

     4645.079102 ms for A.
     4630.725098 ms for B.

    back-to-back, because of that serialization.

    Instead, just time the whole kernel launch sequence:

     cudaEventRecord(a_start);
     kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
     kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
     cudaEventRecord(a_stop);

    I think you will see an elapsed time for (a_start, a_stop) that is roughly the same as one of your previous kernels (~4600 ms), indicating more or less full concurrency. I used CUDA 6 RC, copied the data back to the host instead of using printf from the kernel, and eliminated the cudaEventRecord operations between the kernel calls, and my overall execution time was ~4.8 s. If I didn't modify the cudaEventRecord arrangement, my execution time was ~8.3 s.
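    If per-kernel timings are still wanted, one way to get them without reintroducing the serialization (a sketch of mine, not from the original answer) is to record each event into its own stream; events recorded into a non-default stream are ordered only within that stream:

     /* Per-stream event recording: neither record touches the default
        stream, so the two kernels remain free to overlap. */
     cudaEventRecord(a_start, stream_a);
     kernel_thing<<<1, 512, 49152, stream_a>>>(ad, size);
     cudaEventRecord(a_stop, stream_a);

     cudaEventRecord(b_start, stream_b);
     kernel_thing<<<1, 512, 49152, stream_b>>>(bd, size);
     cudaEventRecord(b_stop, stream_b);

    cudaEventElapsedTime then reports each kernel's duration individually, while the wall-clock span still reflects the concurrent execution.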

    A few other notes:
