NVIDIA GPU atom.global指令Profing
- 一.小结
- 二.输出解释
- 三.复现过程
本文对NVIDIA GPU atom.global指令Profing,并小结
一.小结
- sm atom指令的执行能力为:112 inst/cycle,每条atom warp指令会产生32个request,即3584 request/cycle
- lts有18个slice,每个slice处理能力为 1 request/cycle,总的处理能力为18 request/cycle
- 如果每个sm同时发atom指令,最小需要等待 3584/18=199cycle
- 如果落在同一个lts slice,延迟还会增加
- 同样的atom指令条数,如果一半inc一半dec,总的cycle数比仅inc或dec小约一半,但lts的利用率无明显差异
- 通过比较lts__t_requests_op_atom的sum和max,发现仅inc或dec只有一个lts slice参与,但inc和dec的组合,会用到2个lts slice
二.输出解释
# 28个sm,每个sm个sub_core,每个sub_core上运行1个warp(32线程),每个线程执行32次(atom inc+inc或inc+dec或dec+dec)
# 总的atomic warp请求数为=28*4*32*2=7168inc+inc:239121
inc+dec:121946 #同样的atom指令条数,如果一半inc一半dec,总的cycle数比仅inc或dec小约一半
dec+dec:240249atomic_kernel(float *, float *, int) (28, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
----------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------------------------------- ------------- ------------
lts__cycles_elapsed.avg.per_second Ghz 1.24 #L2 Cache的频率
lts__d_atomic_input_cycles_active.sum cycle 229,381 #--| # of cycles where the atomic unit's input was active
lts__t_requests_op_atom.max request 229,376 #sum等于max 说是只有1个lts参与
lts__t_requests_op_atom.max.peak_sustained request/cycle 1 #每一个atomic unit的峰值性能
lts__t_requests_op_atom.min request 0
lts__t_requests_op_atom.sum request 229,381 #--| 等于atom warp指令数*32,即每个warp atom指令生成32个request
lts__t_requests_op_atom.sum.pct_of_peak_sustained_active % 84.51 # L2 atomic unit 利用率
lts__t_requests_op_atom.sum.peak_sustained request/cycle 18 # atomic unit总的峰值性能,也就是说有18个atomic unit
lts__t_sectors_op_atom.sum sector 229,381 # 请求的sectors个数,跟request一样,说明并没有合并
lts__cycles_active.avg cycle 15,079
lts__cycles_active.max cycle 231,455 #--|
lts__cycles_active.min cycle 1,297 #--| 最大值跟最小值有差距,说明各个lts单元负载不均衡
lts__cycles_active.sum cycle 271,422
smsp__inst_executed_op_generic_atom.avg inst 64.04
smsp__inst_executed_op_generic_atom.max inst 69
smsp__inst_executed_op_generic_atom.min inst 64
smsp__inst_executed_op_generic_atom.sum inst 7,173 #执行的指令条数比sass指令条数多,可能是指令replay导致的
sm__cycles_elapsed.avg.per_second Ghz 1.32 #sm的频率(比lts高)
smsp__cycles_elapsed.avg.per_second Ghz 1.32
smsp__sass_data_bytes_mem_global_op_atom.sum Kbyte 28.67 #warp sass指令条数*sizeof(int)
smsp__sass_inst_executed_op_atom.sum inst 7,168 #warp sass指令条数
smsp__sass_inst_executed_op_atom.sum.pct_of_peak_sustained_active % 0.03 #smsp atom单元的峰值占比(smsp__sass_inst_executed_op_atom.sum/smsp__cycles_active/peak)#因为存在启动开销smsp__cycles_active远大于atom指令发射的cycle数,所以利用率不高
smsp__sass_inst_executed_op_atom.sum.peak_sustained inst/cycle 112 #sub_core atom指令能力:每个sub_core每个cycle处理一条atom warp指令(28sm*4subcore)
smsp__sass_inst_executed_op_global_atom.sum inst 7,168
----------------------------------------------------------------- ------------- ------------atomic_kernel(float *, float *, int) (28, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
----------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------------------------------- ------------- ------------
lts__cycles_elapsed.avg.per_second Ghz 1.24
lts__d_atomic_input_cycles_active.sum cycle 229,381
lts__t_requests_op_atom.max request 114,688 #sum是229381 max是114688 说是有2个lts参与
lts__t_requests_op_atom.max.peak_sustained request/cycle 1
lts__t_requests_op_atom.min request 0
lts__t_requests_op_atom.sum request 229,381
lts__t_requests_op_atom.sum.pct_of_peak_sustained_active % 83.98 #无明显差异
lts__t_requests_op_atom.sum.peak_sustained request/cycle 18
lts__t_sectors_op_atom.sum sector 229,381
lts__cycles_active.avg cycle 15,173.78
lts__cycles_active.max cycle 117,755 #最大值比仅inc或dec要小,说明负载输相对均匀一些
lts__cycles_active.min cycle 1,391
lts__cycles_active.sum cycle 273,128
smsp__inst_executed_op_generic_atom.avg inst 64.04
smsp__inst_executed_op_generic_atom.max inst 69
smsp__inst_executed_op_generic_atom.min inst 64
smsp__inst_executed_op_generic_atom.sum inst 7,173
sm__cycles_elapsed.avg.per_second Ghz 1.32
smsp__cycles_elapsed.avg.per_second Ghz 1.32
smsp__sass_data_bytes_mem_global_op_atom.sum Kbyte 28.67
smsp__sass_inst_executed_op_atom.sum inst 7,168
smsp__sass_inst_executed_op_atom.sum.pct_of_peak_sustained_active % 0.05
smsp__sass_inst_executed_op_atom.sum.peak_sustained inst/cycle 112
smsp__sass_inst_executed_op_global_atom.sum inst 7,168
----------------------------------------------------------------- ------------- ------------atomic_kernel(float *, float *, int) (28, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
----------------------------------------------------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------------------------------------------------- ------------- ------------
lts__cycles_elapsed.avg.per_second Ghz 1.24
lts__d_atomic_input_cycles_active.sum cycle 229,381
lts__t_requests_op_atom.max request 229,376
lts__t_requests_op_atom.max.peak_sustained request/cycle 1
lts__t_requests_op_atom.min request 0
lts__t_requests_op_atom.sum request 229,381
lts__t_requests_op_atom.sum.pct_of_peak_sustained_active % 84.46 #无明显差异
lts__t_requests_op_atom.sum.peak_sustained request/cycle 18
lts__t_sectors_op_atom.sum sector 229,381
lts__cycles_active.avg cycle 15,088.72
lts__cycles_active.max cycle 231,378 #比仅inc或dec要高,说明负载均衡差一些
lts__cycles_active.min cycle 1,221
lts__cycles_active.sum cycle 271,597
smsp__inst_executed_op_generic_atom.avg inst 64.04
smsp__inst_executed_op_generic_atom.max inst 69
smsp__inst_executed_op_generic_atom.min inst 64
smsp__inst_executed_op_generic_atom.sum inst 7,173
sm__cycles_elapsed.avg.per_second Ghz 1.32
smsp__cycles_elapsed.avg.per_second Ghz 1.32
smsp__sass_data_bytes_mem_global_op_atom.sum Kbyte 28.67
smsp__sass_inst_executed_op_atom.sum inst 7,168
smsp__sass_inst_executed_op_atom.sum.pct_of_peak_sustained_active % 0.03
smsp__sass_inst_executed_op_atom.sum.peak_sustained inst/cycle 112
smsp__sass_inst_executed_op_global_atom.sum inst 7,168
----------------------------------------------------------------- ------------- ------------
三.复现过程
# 生成Kernel代码
tee atomic_kernel.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>__device__ unsigned int inc_count = 0;
__device__ unsigned int dec_count = 65535000;#define LOOP_COUNT 32__global__ void atomic_kernel(float *input,float *output,int mode) {unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;unsigned int inc_temp=0;unsigned int dec_temp=0;unsigned int start_cycle, end_cycle;void *ptr0=&inc_count;void *ptr1=&dec_count;if(mode==0){//inc+inc__syncthreads();asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");#pragma unrollfor(int i=0;i<LOOP_COUNT;i++){asm("atom.global.inc.u32 %0, [%1], %2;" : "=r"(inc_temp) : "l"(ptr0), "r"(inc_temp));asm("atom.global.inc.u32 %0, [%1], %2;" : "=r"(inc_temp) : "l"(ptr0), "r"(inc_temp));}// __syncthreads(); asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");input[tid]=inc_temp+dec_temp;if(tid==0){printf("inc+inc:%lld\n",end_cycle-start_cycle);}}if(mode==1){//inc+dec__syncthreads();asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");#pragma unrollfor(int i=0;i<LOOP_COUNT;i++){asm("atom.global.inc.u32 %0, [%1], %2;" : "=r"(inc_temp) : "l"(ptr0), "r"(inc_temp));asm("atom.global.dec.u32 %0, [%1], %2;" : "=r"(dec_temp) : "l"(ptr1), "r"(dec_temp));} // __syncthreads();asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");input[tid]=inc_temp+dec_temp;if(tid==0){printf("inc+dec:%lld\n",end_cycle-start_cycle);}}if(mode==2){//dec+dec__syncthreads();asm volatile ("mov.u32 %0, %%clock;" : "=r"(start_cycle) :: "memory");#pragma unrollfor(int i=0;i<LOOP_COUNT;i++){asm("atom.global.dec.u32 %0, [%1], %2;" : "=r"(dec_temp) : "l"(ptr1), "r"(dec_temp));asm("atom.global.dec.u32 %0, [%1], %2;" : "=r"(dec_temp) : "l"(ptr1), "r"(dec_temp));} //__syncthreads();asm volatile ("mov.u32 %0, %%clock;" : "=r"(end_cycle) :: "memory");input[tid]=inc_temp+dec_temp;if(tid==0){printf("dec+dec:%lld\n",end_cycle-start_cycle);}}
}
EOF# 编译Kernel
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx atomic_kernel.cu -o atomic_kernel.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 atomic_kernel.ptx -cubin -o atomic_kernel.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 atomic_kernel.cubin -fatbin -o atomic_kernel.fatbin
cat atomic_kernel.ptx
/usr/local/cuda/bin/cuobjdump --dump-sass atomic_kernel.fatbin# 生成测试代码
tee atomic_kernel_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>int main(int argc,char *argv[])
{CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount = 0;error = cuDeviceGetCount(&deviceCount);error = cuDeviceGet(&cuDevice, 0);if(error!=CUDA_SUCCESS){printf("Error happened in get device!\n");}CUcontext cuContext;error = cuCtxCreate(&cuContext, 0, cuDevice);if(error!=CUDA_SUCCESS){printf("Error happened in create context!\n");}CUmodule module;CUfunction function;const char* module_file = "atomic_kernel.fatbin";const char* kernel_name = "_Z13atomic_kernelPfS_i";error = cuModuleLoad(&module, module_file);if(error!=CUDA_SUCCESS){printf("Error happened in load moudle %d!\n",error);}error = cuModuleGetFunction(&function, module, kernel_name);if(error!=CUDA_SUCCESS){printf("get function error!\n");}int block_count=28;int block_size=32*4;int thread_size=block_count*block_size;int data_size=sizeof(float)*thread_size;float *dev_mem=nullptr;int cudaStatus=0;cudaStatus = cudaMalloc((void**)&dev_mem, data_size*3);if(cudaStatus){printf("cudaMalloc1 Failed\n");}float *input_ptr[2]={&dev_mem[0],&dev_mem[thread_size*2]};for(int mode=0;mode<3;mode++){void *kernelParams[]= {(void*)&input_ptr[0], (void*)&input_ptr[1],(void*)&mode};auto ret=cuLaunchKernel(function,block_count, 1, 1,block_size,1,1,0,0,kernelParams, 0);cudaError_t cudaerr = cudaDeviceSynchronize();if (cudaerr != cudaSuccess){printf("kernel launch failed with error \"%s\".\n",cudaGetErrorString(cudaerr)); }} cudaFree(dev_mem);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0;
}
EOF
# 编译测试代码
g++ atomic_kernel_main.cpp -o atomic_kernel_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda# Profing
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__inst_executed_op_generic_atom,\
smsp__sass_inst_executed_op_atom.sum,\
smsp__sass_inst_executed_op_atom.sum.peak_sustained,\
smsp__sass_inst_executed_op_atom.sum.pct_of_peak_sustained_active,\
lts__t_sectors_op_atom.sum,\
lts__t_requests_op_atom.sum,\
lts__t_requests_op_atom.max,\
lts__t_requests_op_atom.min,\
lts__cycles_active,\
sm__cycles_elapsed.avg.per_second,\
smsp__cycles_elapsed.avg.per_second,\
lts__d_atomic_input_cycles_active.sum,\
lts__cycles_elapsed.avg.per_second,\
lts__t_requests_op_atom.sum.peak_sustained,\
lts__t_requests_op_atom.max.peak_sustained,\
lts__t_requests_op_atom.sum.pct_of_peak_sustained_active,\
smsp__sass_inst_executed_op_global_atom.sum,\
smsp__sass_data_bytes_mem_global_op_atom.sum ./atomic_kernel_main