|
CUDA 并行编程:使用原子操作(atomic)计算直方图,并比较 CUDA 与 CPU 的耗时
- #include <stdio.h>
- #include <sys/time.h>
- // Histogram kernel: every thread reads one element of `a`, truncates it to an int and
- // atomically increments the counter with that index in `hist`.
- //
- // Launch layout: 1-D grid of 1-D blocks. The kernel has no bounds guard, so the launch must
- // contain exactly one thread per element of `a`, and every truncated element value must be a
- // valid index into `hist`.
- //
- //   a    - input values (read only by this kernel)
- //   hist - counters updated in place; they are not cleared here
- __global__ void get_hist(float *a, int *hist)
- {
-     // Flat global element index. Computed in 64 bits so that blockIdx.x * blockDim.x cannot
-     // wrap around (or turn negative) when the launch covers 2^31 or more elements.
-     const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
-     const int bin = (int)a[idx];
-     // atomicAdd performs the read-modify-write as one indivisible operation, so threads that
-     // hit the same counter concurrently do not lose updates.
-     atomicAdd(&hist[bin], 1);
- }
- // CUDA parallel programming: computing a histogram with atomic operations and comparing the GPU and CPU run time.
- // Reports a failed CUDA runtime call on stderr.
- // Returns true when `status` is cudaSuccess, false otherwise.
- static bool cudaOk(cudaError_t status, const char *what)
- {
-     if (status == cudaSuccess)
-     {
-         return true;
-     }
-     fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
-     return false;
- }
- // Microseconds elapsed between two gettimeofday() samples.
- static long elapsedMicros(const struct timeval &start, const struct timeval &end)
- {
-     return (long)(end.tv_sec - start.tv_sec) * 1000000L + (long)(end.tv_usec - start.tv_usec);
- }
- // Fills an array with small integer values, builds its histogram once on the GPU (get_hist)
- // and once on the CPU, and prints both histograms together with the wall-clock time of each.
- // Returns 0 on success and 1 when any CUDA call failed.
- int main()
- {
-     const int size = 32000000;
-     const int length = 10; // number of histogram bins; also the modulus of the generated values
-     const int threadsPerBlock = 512;
-     // get_hist receives only the two pointers, so the launch below must contain exactly
-     // `size` threads; a floor division would otherwise silently skip the tail elements.
-     if (size % threadsPerBlock != 0)
-     {
-         fprintf(stderr, "size (%d) must be a multiple of %d\n", size, threadsPerBlock);
-         return 1;
-     }
-     float *a = new float[size];
-     for (int i = 0; i < size; i++)
-     {
-         // The product is formed in 64 bits: as an int it overflows from i = 46341 on, and the
-         // wrapped negative value would end up as a negative (out-of-bounds) bin index.
-         a[i] = (float)(((long long)i * (i + 1)) % length);
-     }
-     int hist[length] = {0};
-     const size_t inputBytes = size * sizeof(float);
-     const size_t histBytes = length * sizeof(int);
-     float *aGpu = NULL;
-     int *histGpu = NULL;
-     // Each step only runs while all previous CUDA calls succeeded.
-     bool ok = cudaOk(cudaMalloc(&aGpu, inputBytes), "cudaMalloc(aGpu)");
-     ok = ok && cudaOk(cudaMemcpy(aGpu, a, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> aGpu)");
-     ok = ok && cudaOk(cudaMalloc(&histGpu, histBytes), "cudaMalloc(histGpu)");
-     ok = ok && cudaOk(cudaMemcpy(histGpu, hist, histBytes, cudaMemcpyHostToDevice), "cudaMemcpy(hist -> histGpu)");
-     // Timing
-     struct timeval startTime, endTime;
-     if (ok)
-     {
-         gettimeofday(&startTime, NULL);
-         get_hist<<<size / threadsPerBlock, threadsPerBlock>>>(aGpu, histGpu);
-         // A kernel launch returns nothing; configuration errors are picked up here.
-         ok = cudaOk(cudaGetLastError(), "get_hist launch");
-         // The launch is asynchronous and this blocking copy waits for the kernel to finish,
-         // so the measured interval is kernel execution plus the small result copy.
-         ok = ok && cudaOk(cudaMemcpy(hist, histGpu, histBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(histGpu -> hist)");
-         gettimeofday(&endTime, NULL);
-     }
-     if (ok)
-     {
-         printf("gpu计算的结果(次数统计):\n");
-         for (int i = 0; i < length; i++)
-         {
-             printf("%d,", hist[i]);
-         }
-         printf("cuda use time(单位是微秒): %ld\n", elapsedMicros(startTime, endTime));
-     }
-     // Clear the counters before the CPU reference run.
-     for (int i = 0; i < length; i++)
-     {
-         hist[i] = 0;
-     }
-     gettimeofday(&startTime, NULL);
-     for (int i = 0; i < size; i++)
-     {
-         hist[(int)a[i]] += 1;
-     }
-     gettimeofday(&endTime, NULL);
-     printf("cpu计算的结果(次数统计):\n");
-     for (int i = 0; i < length; i++)
-     {
-         printf("%d,", hist[i]);
-     }
-     printf("cpu use time: %ld\n", elapsedMicros(startTime, endTime));
-     // Release device and host buffers (cudaFree accepts a null pointer).
-     ok = cudaOk(cudaFree(histGpu), "cudaFree(histGpu)") && ok;
-     ok = cudaOk(cudaFree(aGpu), "cudaFree(aGpu)") && ok;
-     delete[] a;
-     return ok ? 0 : 1;
- }
复制代码
|
|