CUDA编程模型系列十( CUDA Stream / CUDA 流 / 多流执行)
本系列视频目的是帮助开发者们一步步地学会利用CUDA编程模型加速GPU应用, 我们的口号是: 让GPU飞起来
本期我介绍了CUDA 中Stream的概念和使用方法, CUDA流也是很多CUDA加速库(cuBLAS, cuDNN, TensorRT)中常用的手段, 它能让多个执行队列并行执行, 还能让这些队列执行的过程中相对独立, 彼此不受影响
// cuda stream
// cuda lib, cudnn cublas tensort
// stream-- 一系列的指令执行队列
// mul-stream -- asyn -- order-- asyn#include <stdio.h>
#include <math.h>// a[] + b[] = c[]#define N (1024 * 1024)
#define FULL_SIZE (N * 30)
//stream[2] __global__ void kernel(int *a, int *b, int *c)
{int idx = threadIdx.x + blockDim.x * blockIdx.x;if( idx < N){int idx1 = (idx + 1)%256;int idx2 = (idx + 2)%256;float as = (a[idx] + a[idx1] + a[idx2]) / 3.0;float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0;c[idx] = (as + bs)/2;}
}int main()
{cudaDeviceProp prop;int whichDevice;cudaGetDevice(&whichDevice);cudaGetDeviceProperties(&prop, whichDevice);if( !prop.deviceOverlap ){printf("Your device will not support speed up from multi-streams\n");return 0;}cudaEvent_t start, stop;float elapsedTime;cudaStream_t my_stream[3];int *h_a, *h_b, *h_c;int *d_a0, *d_b0, *d_c0;int *d_a1, *d_b1, *d_c1;int *d_a2, *d_b2, *d_c2;cudaEventCreate(&start);cudaEventCreate(&stop);cudaStreamCreate(&my_stream[0]);cudaStreamCreate(&my_stream[1]);cudaStreamCreate(&my_stream[2]);cudaMalloc((void**) &d_a0, N * sizeof(int));cudaMalloc((void**) &d_b0, N * sizeof(int));cudaMalloc((void**) &d_c0, N * sizeof(int));cudaMalloc((void**) &d_a1, N * sizeof(int));cudaMalloc((void**) &d_b1, N * sizeof(int));cudaMalloc((void**) &d_c1, N * sizeof(int));cudaMalloc((void**) &d_a2, N * sizeof(int));cudaMalloc((void**) &d_b2, N * sizeof(int));cudaMalloc((void**) &d_c2, N * sizeof(int));cudaHostAlloc((void**) &h_a, FULL_SIZE * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void**) &h_b, FULL_SIZE * sizeof(int), cudaHostAllocDefault);cudaHostAlloc((void**) &h_c, FULL_SIZE * sizeof(int), cudaHostAllocDefault);for(int i = 0; i<FULL_SIZE; i++){h_a[i] = rand()%1024;h_b[i] = rand()%1024;}cudaEventRecord(start);for(int i = 0; i < FULL_SIZE; i += N * 1){cudaMemcpyAsync(d_a0, h_a+i, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[0]);//cudaMemcpyAsync(d_a1, h_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[1]);//cudaMemcpyAsync(d_a2, h_a+i+N+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[2]);cudaMemcpyAsync(d_b0, h_a+i, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[0]);//cudaMemcpyAsync(d_b1, h_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[1]);//cudaMemcpyAsync(d_b2, h_a+i+N+N, N*sizeof(int), cudaMemcpyHostToDevice, my_stream[2]);kernel<<<N/256, 256, 0, my_stream[0]>>>(d_a0, d_b0, d_c0);//kernel<<<N/256, 256, 0, my_stream[1]>>>(d_a1, d_b1, d_c1);//kernel<<<N/256, 256, 0, my_stream[2]>>>(d_a2, d_b2, d_c2);cudaMemcpyAsync(h_c+i, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);//cudaMemcpyAsync(h_c+i+N, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);//cudaMemcpyAsync(h_c+i+N+N, d_c0, N*sizeof(int), cudaMemcpyDeviceToHost, my_stream[0]);}cudaStreamSynchronize(my_stream[0]);cudaStreamSynchronize(my_stream[1]);cudaStreamSynchronize(my_stream[2]);cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&elapsedTime , start, stop);printf("Time: %3.2f ms\n", elapsedTime);// cudaFreereturn 0;
}