c/c++语言开发共享CUDA – Eratosthenes筛分为部分-计算机技术网

我在GPU上编写了埃拉托斯特尼筛法（Sieve of Eratosthenes， https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes ）的一个实现，但做法与这篇文章中的不同： https://developer-resource.blogspot.com/2008/07/cuda-sieve-of-eratosthenes.html

方法：

代码：

#include <stdio.h>
#include <stdlib.h>

#define THREADS 1024

/*
 * Per-block sieve step.
 *
 * Launch layout: 1D grid of 1D blocks; dynamic shared memory of
 * blockDim.x * sizeof(int) bytes must be supplied at launch.
 *
 * Each block copies its slice global[offset+1 .. offset+blockDim.x] into
 * shared memory, then thread `tid` (1-based) flags every number in
 * offset+1 .. offset+threads that is a multiple of `tid` (other than `tid`
 * itself) by writing 1 into the matching cache slot. The slice is then
 * written back to global memory.
 *
 * global  - device array of flags; 0 = still considered prime, 1 = crossed out
 * threads - number of candidates handled per block (main passes blockDim.x)
 *
 * NOTE(review): the divisors tried are only 2..blockDim.x. A composite whose
 * smallest prime factor is larger than blockDim.x is therefore never flagged.
 * NOTE(review): the last thread of the last block addresses
 * global[gridDim.x * blockDim.x]; when that product equals the array length
 * this is one element past the end of the allocation.
 */
__global__ void kernel(int *global, int threads) {
    extern __shared__ int cache[];

    int tid = threadIdx.x + 1;              /* 1-based: also used as the divisor */
    int offset = blockIdx.x * blockDim.x;
    int number = offset + tid;              /* the integer this thread owns */

    cache[tid - 1] = global[number];
    __syncthreads();

    int start = offset + 1;
    int end = offset + threads;

    for (int i = start; i <= end; i++) {
        /* tid == 1 divides everything and i == tid is the number itself: skip both */
        if ((i != tid) && (tid != 1) && (i % tid == 0)) {
            cache[i - offset - 1] = 1;
        }
    }
    __syncthreads();

    global[number] = cache[tid - 1];
}

/*
 * Reads n from argv[1], builds a flag array of n ints (0 and 1 pre-marked as
 * non-prime), runs `kernel` over it and prints how many entries are still 0.
 *
 * NOTE(review): argv[1] is used without checking argc, and the results of
 * malloc / cudaMalloc / cudaMemcpy / the launch are not checked.
 */
int main(int argc, char *argv[]) {
    int *array, *dev_array;
    int n = atol(argv[1]);
    int n_sqrt = floor(sqrt((double)n));

    size_t array_size = n * sizeof(int);
    array = (int*) malloc(n * sizeof(int));
    array[0] = 1;
    array[1] = 1;
    for (int i = 2; i < n; i++) {
        array[i] = 0;
    }

    cudaMalloc((void**)&dev_array, array_size);
    cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice);

    int threads = min(n_sqrt, THREADS);
    int blocks = n / threads;               /* integer division: any remainder is not covered */
    int shared = threads * sizeof(int);     /* bytes of dynamic shared memory per block */

    kernel<<<blocks, threads, shared>>>(dev_array, threads);

    cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost);

    int count = 0;
    for (int i = 0; i < n; i++) {
        if (array[i] == 0) {
            count++;
        }
    }
    printf("Count: %d\n", count);
    return 0;
}

运行：./sieve 10240000

当n = 16,64,1024,102400时，它可以正常工作……但是对于n = 10240000，我得到的结果不正确。哪里有问题？

在我看来，这段代码有很多问题。

下面的代码尝试修复上面的第1个问题，并且至少能够说明第2和第3个问题所导致的错误：

 #include  #include  #define THREADS 1024 #define MAX 10240000 #define cudaCheckErrors(msg)  do {  cudaError_t __err = cudaGetLastError();  if (__err != cudaSuccess) {  fprintf(stderr, "Fatal error: %s (%s at %s:%d)n",  msg, cudaGetErrorString(__err),  __FILE__, __LINE__);  fprintf(stderr, "*** FAILED - ABORTINGn");  exit(1);  }  } while (0) __global__ void kernel(int *global, int threads) { extern __shared__ int cache[]; int tid = threadIdx.x + 1; int offset = blockIdx.x * blockDim.x; int number = offset + tid; if ((blockIdx.x != (gridDim.x-1)) || (threadIdx.x != (blockDim.x-1))){ cache[tid - 1] = global[number]; __syncthreads(); int start = offset + 1; int end = offset + threads; for (int i = start; i <= end; i++) { if ((i != tid) && (tid != 1) && (i % tid == 0)) { cache[i - offset - 1] = 1; } } __syncthreads(); global[number] = cache[tid - 1];} } int cpu_sieve(int n){ int limit = floor(sqrt(n)); int *test_arr = (int *)malloc(n*sizeof(int)); if (test_arr == NULL) return -1; memset(test_arr, 0, n*sizeof(int)); for (int i = 2; i < limit; i++) if (!test_arr[i]){ int j = i*i; while (j <= n){ test_arr[j] = 1; j += i;}} int count = 0; for (int i = 2; i < n; i++) if (!test_arr[i]) count++; return count; } int main(int argc, char *argv[]) { int *array, *dev_array; if (argc != 2) {printf("must supply n as command line parametern"); return 1;} int n = atoi(argv[1]); if ((n < 1) || (n > MAX)) {printf("n out of range %dn", n); return 1;} int n_sqrt = floor(sqrt((double)n)); size_t array_size = n * sizeof(int); array = (int*) malloc(n * sizeof(int)); array[0] = 1; array[1] = 1; for (int i = 2; i < n; i++) { array[i] = 0; } cudaMalloc((void**)&dev_array, array_size); cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice); int threads = min(n_sqrt, THREADS); int blocks = n / threads; int shared = threads * sizeof(int); printf("threads = %d, blocks = %dn", threads, blocks); kernel<<>>(dev_array, threads); cudaMemcpy(array, dev_array, array_size, 
cudaMemcpyDeviceToHost); cudaCheckErrors("some error"); int count = 0; for (int i = 0; i < n; i++) { if (array[i] == 0) { count++; } } printf("Count: %dn", count); printf("CPU Sieve: %dn", cpu_sieve(n)); return 0; }

我认为这里有好几个问题，不过下面这一点指向了真正的症结：埃拉托斯特尼筛法会依次筛去已经找到的素数的倍数，而你想把工作量划分给多个线程块，让每个线程块在自己的一块共享内存（即你示例中的 cache）上运行。但是，线程块通常彼此独立，很难相互通信。举个例子来说明这个问题：索引为0的线程块中索引为0的线程筛去了2的倍数，而索引大于0的线程块无从得知这一点。

需要了解更多c/c++开发分享CUDA – Eratosthenes筛分为部分，也可以关注C/ C++技术分享栏目---计算机技术网(www.ctvol.com)!

以上就是c/c++开发分享CUDA – Eratosthenes筛分为部分相关内容,想了解更多C/C++开发(异常处理)及C/C++游戏开发关注计算机技术网(www.ctvol.com)!)。

本文来自网络收集，不代表计算机技术网立场，如涉及侵权请联系管理员删除。

ctvol管理联系方式QQ:251552304

本文章地址：https://www.ctvol.com/c-cdevelopment/979410.html

c/c++语言开发共享CUDA – Eratosthenes筛分为部分

精彩推荐