c/c++语言开发共享CUDA – Eratosthenes筛分为部分

我在GPU上编写了Eratosthenes的Sieve( https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes )的实现。 但不是这样的 – https://developer-resource.blogspot.com/2008/07/cuda-sieve-of-eratosthenes.html

方法:

码:

#include  #include  #define THREADS 1024 __global__ void kernel(int *global, int threads) { extern __shared__ int cache[]; int tid = threadIdx.x + 1; int offset = blockIdx.x * blockDim.x; int number = offset + tid; cache[tid - 1] = global[number]; __syncthreads(); int start = offset + 1; int end = offset + threads; for (int i = start; i <= end; i++) { if ((i != tid) && (tid != 1) && (i % tid == 0)) { cache[i - offset - 1] = 1; } } __syncthreads(); global[number] = cache[tid - 1]; } int main(int argc, char *argv[]) { int *array, *dev_array; int n = atol(argv[1]); int n_sqrt = floor(sqrt((double)n)); size_t array_size = n * sizeof(int); array = (int*) malloc(n * sizeof(int)); array[0] = 1; array[1] = 1; for (int i = 2; i < n; i++) { array[i] = 0; } cudaMalloc((void**)&dev_array, array_size); cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice); int threads = min(n_sqrt, THREADS); int blocks = n / threads; int shared = threads * sizeof(int); kernel<<>>(dev_array, threads); cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost); int count = 0; for (int i = 0; i < n; i++) { if (array[i] == 0) { count++; } } printf("Count: %dn", count); return 0; } 

运行:./ieve 10240000

当n = 16,64,1024,102400时,它可以正常工作……但是对于n = 10240000,我得到的结果不正确。 哪里有问题?

    在我看来,这段代码有很多问题。

    以下是一些代码尝试解决上面的问题#1,并至少解释与#2和#3相关的故障:

     #include  #include  #define THREADS 1024 #define MAX 10240000 #define cudaCheckErrors(msg)  do {  cudaError_t __err = cudaGetLastError();  if (__err != cudaSuccess) {  fprintf(stderr, "Fatal error: %s (%s at %s:%d)n",  msg, cudaGetErrorString(__err),  __FILE__, __LINE__);  fprintf(stderr, "*** FAILED - ABORTINGn");  exit(1);  }  } while (0) __global__ void kernel(int *global, int threads) { extern __shared__ int cache[]; int tid = threadIdx.x + 1; int offset = blockIdx.x * blockDim.x; int number = offset + tid; if ((blockIdx.x != (gridDim.x-1)) || (threadIdx.x != (blockDim.x-1))){ cache[tid - 1] = global[number]; __syncthreads(); int start = offset + 1; int end = offset + threads; for (int i = start; i <= end; i++) { if ((i != tid) && (tid != 1) && (i % tid == 0)) { cache[i - offset - 1] = 1; } } __syncthreads(); global[number] = cache[tid - 1];} } int cpu_sieve(int n){ int limit = floor(sqrt(n)); int *test_arr = (int *)malloc(n*sizeof(int)); if (test_arr == NULL) return -1; memset(test_arr, 0, n*sizeof(int)); for (int i = 2; i < limit; i++) if (!test_arr[i]){ int j = i*i; while (j <= n){ test_arr[j] = 1; j += i;}} int count = 0; for (int i = 2; i < n; i++) if (!test_arr[i]) count++; return count; } int main(int argc, char *argv[]) { int *array, *dev_array; if (argc != 2) {printf("must supply n as command line parametern"); return 1;} int n = atoi(argv[1]); if ((n < 1) || (n > MAX)) {printf("n out of range %dn", n); return 1;} int n_sqrt = floor(sqrt((double)n)); size_t array_size = n * sizeof(int); array = (int*) malloc(n * sizeof(int)); array[0] = 1; array[1] = 1; for (int i = 2; i < n; i++) { array[i] = 0; } cudaMalloc((void**)&dev_array, array_size); cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice); int threads = min(n_sqrt, THREADS); int blocks = n / threads; int shared = threads * sizeof(int); printf("threads = %d, blocks = %dn", threads, blocks); kernel<<>>(dev_array, threads); cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost); cudaCheckErrors("some error"); int count = 0; for (int i = 0; i < n; i++) { if (array[i] == 0) { count++; } } printf("Count: %dn", count); printf("CPU Sieve: %dn", cpu_sieve(n)); return 0; } 

    我认为有几个问题,但这里是指向实际问题的指针:Eratosthenes的筛子去除了已经遇到的素数的迭代倍数,并且你想要将工作量分成线程块,其中每个线程 – block在一块共享内存上运行(在您的示例中为缓存)。 但是,线程块通常独立于所有其他线程块,并且不能容易地彼此通信。 举例来说明这个问题:带索引0的线程块中索引为0的线程删除了2的倍数。索引> 0的线程块无法知道这一点。

    需要了解更多c/c++开发分享CUDA – Eratosthenes筛分为部分,也可以关注C/ C++技术分享栏目---计算机技术网(www.ctvol.com)!

      以上就是c/c++开发分享CUDA – Eratosthenes筛分为部分相关内容,想了解更多C/C++开发(异常处理)及C/C++游戏开发关注计算机技术网(www.ctvol.com)!)。

      本文来自网络收集,不代表计算机技术网立场,如涉及侵权请联系管理员删除。

      ctvol管理联系方式QQ:251552304

      本文章地址:https://www.ctvol.com/c-cdevelopment/979410.html

      (0)
      上一篇 2021年12月12日
      下一篇 2021年12月12日

      精彩推荐