统计一条cuda ld指令需要经过哪些硬件单元--演示CuAssembler如何修改CUDA SASS指令

1.准备SASS反汇编工具CuAssembler
2.仅包含ld.global.cv.f32的cuda kernel,如果不加st指令,编译器会将ld指令也优化掉。后面手动修改汇编指令删除掉st指令
3.生成fatbin
4.修改SASS指令,删除掉STG.E.STRONG.SYS指令,重新生成fatbin
5.准备测试程序,加载fatbin并运行里面的Kernel
6.ncu profing
7.获取NCU支持的metrics列表
8.查询每个metrics
9.过滤掉值为0的metrics

背景:想统计一条ld指令需要经过哪些硬件单元
步骤:

cuda Kernel里只包含一条load指令,但如果没有st会被编译器优化掉(ptx还在,但sass里却没了)
暂时没有找到编译选项关掉该优化
于是采用CuAssembler将ST SASS指令删掉,重新生成fatbin
采用cuModuleLoad加载fatbin,用cuLaunchKernel运行该Kernel

SASSCuAssembler_10">1.准备SASS反汇编工具CuAssembler

git clone https://github.com/cloudcores/CuAssembler
export PATH=${PATH}:$PWD/CuAssembler/bin:/usr/local/cuda/bin/
export PYTHONPATH=${PYTHOPATH}:$PWD/CuAssembler/
pip install pyelftools

2.仅包含ld.global.cv.f32的cuda kernel,如果不加st指令,编译器会将ld指令也优化掉。后面手动修改汇编指令删除掉st指令

tee ptx_ld_inst.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
__global__ void ptx_ld_inst_kernel(float *input, float *out) {float d;int tid  = threadIdx.x + blockIdx.x * blockDim.x;asm("ld.global.cv.f32 %0, [%1];" : "=f"(d) : "l"(&input[tid]));asm("st.global.wt.f32 [%0],%1;" :: "l"(&out[tid]),"f"(d));
}
EOF

3.生成fatbin

# 生成ptx
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx ptx_ld_inst.cu -o ptx_ld_inst.ptx
# 生成cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.ptx -cubin -o ptx_ld_inst.cubin
# 生成fatbin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# 查看ptx
cat ptx_ld_inst.ptx
# 查看sass指令
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
# 输出:/*0070*/                   LDG.E.STRONG.SYS R3, [R2.64] ;         /* 0x0000000402037981 *//* 0x000ea2000c1f5900 *//*0080*/                   IMAD.WIDE R4, R4, R5, c[0x0][0x168] ;  /* 0x00005a0004047625 *//* 0x000fca00078e0205 *//*0090*/                   STG.E.STRONG.SYS [R4.64], R3 ;         /* 0x0000000304007986 *//* 0x004fe2000c115904 *//*00a0*/                   EXIT ;                                 /* 0x000000000000794d */

SASSSTGESTRONGSYSfatbin_54">4.修改SASS指令,删除掉STG.E.STRONG.SYS指令,重新生成fatbin

cuasm.py ptx_ld_inst.cubin ptx_ld_inst.cuasm
cat ptx_ld_inst.cuasm | grep "STG.E.STRONG.SYS"  -B 2
# 输出[B------:R-:W2:-:S01]         /*0070*/                   LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ;[B------:R-:W-:Y:S05]         /*0080*/                   IMAD.WIDE R4, R4, R5, c[0x0][0x168] ;[B--2---:R-:W-:-:S01]         /*0090*/                   STG.E.STRONG.SYS desc[UR4][R4.64], R3 ;# 删除这二行
sed '/STG.E.STRONG.SYS/d' -i ptx_ld_inst.cuasm
sed '/IMAD.WIDE R4/d' -i ptx_ld_inst.cuasm# 生新行成cubin
cuasm.py ptx_ld_inst.cuasm
# 生成fatbin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# 查看sass指令
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
输出:/*0050*/                   IMAD R4, R3, c[0x0][0x0], R4 ;           /* 0x0000000003047a24 *//* 0x001fc800078e0204 *//*0060*/                   IMAD.WIDE R2, R4, R5, c[0x0][0x160] ;    /* 0x0000580004027625 *//* 0x000fcc00078e0205 *//*0070*/                   LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ;  /* 0x0000000402037981 *//* 0x000ea2200c1f5900 *//*0080*/                   EXIT ;                                   /* 0x000000000000794d */

5.准备测试程序,加载fatbin并运行里面的Kernel

tee ptx_ld_inst_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>int main(int argc,char *argv[])
{CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount = 0;error = cuDeviceGetCount(&deviceCount);error = cuDeviceGet(&cuDevice, 0);if(error!=CUDA_SUCCESS){printf("Error happened in get device!\n");}CUcontext cuContext;error = cuCtxCreate(&cuContext, 0, cuDevice);if(error!=CUDA_SUCCESS){printf("Error happened in create context!\n");}CUmodule module;CUfunction function;const char* module_file = "ptx_ld_inst.fatbin";const char* kernel_name = "_Z18ptx_ld_inst_kernelPfS_";error = cuModuleLoad(&module, module_file);if(error!=CUDA_SUCCESS){printf("Error happened in load moudle %d!\n",error);}error = cuModuleGetFunction(&function, module, kernel_name);if(error!=CUDA_SUCCESS){printf("get function error!\n");}int data_size=sizeof(float)*8192;float *output_ptr=nullptr;float *input_ptr=nullptr;int cudaStatus=0;cudaStatus = cudaMalloc((void**)&input_ptr, data_size);cudaStatus = cudaMalloc((void**)&output_ptr, data_size);void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};cuLaunchKernel(function,1, 1, 1,32, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0;
}
EOF
g++ ptx_ld_inst_main.cpp -o ptx_ld_inst_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda

6.ncu profing

/usr/local/NVIDIA-Nsight-Compute/ncu --set full --section SpeedOfLight_HierarchicalTensorRooflineChart \--target-processes all --clock-control=none \--print-details all --export ncu_report_ptx_ld_inst -f ./ptx_ld_inst_main/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio,\
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max,\
l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_requests_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max,\
sm__sass_data_bytes_mem_global_op_ld.max,\
sm__sass_inst_executed_op_global_ld.max,\
sm__sass_inst_executed_op_ld.max,\
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max,\
smsp__inst_executed_op_global_ld.max,\
smsp__inst_executed_op_global_ld_pred_on_any.max,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio,\
smsp__sass_data_bytes_mem_global_op_ld.max,\
smsp__sass_inst_executed_op_global_ld.max,\
smsp__sass_inst_executed_op_ld.max,\
smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max  ./ptx_ld_inst_main

输出

 ----------------------------------------------------------------------- ----------- ------------Metric Name                                                             Metric Unit Metric Value----------------------------------------------------------------------- ----------- ------------l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate    sector/1           32l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct                %        12.50l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio       sector/1            4l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max                                       0l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max                                byte          128l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max                            sector            4l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max                                   byte          128l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max                       byte          128l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max                                       1l1tex__t_requests_pipe_lsu_mem_global_op_ld.max                                                1l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max                               sector            4l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max                   sector            4l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max                                            1l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max                          cycle            0sm__sass_data_bytes_mem_global_op_ld.max                                       byte          128sm__sass_inst_executed_op_global_ld.max                                        inst            1sm__sass_inst_executed_op_ld.max                                               inst            1sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max                       sector            4smsp__inst_executed_op_global_ld.max                                           inst            1smsp__inst_executed_op_global_ld_pred_on_any.max                               inst            1smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate      byte/sector           32smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct                     %          100smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio         byte/sector           32smsp__sass_data_bytes_mem_global_op_ld.max                                     byte          128smsp__sass_inst_executed_op_global_ld.max                                      inst            1smsp__sass_inst_executed_op_ld.max                                             inst            1smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max                     sector            4----------------------------------------------------------------------- ----------- ------------

7.获取NCU支持的metrics列表

/usr/local/NVIDIA-Nsight-Compute/ncu --query-metrics \--csv | awk -F, '{print $1}' | sed 's/"//g' | tail -n +2 > metrics.txt

8.查询每个metrics

tee get_metrics.sh<<-'EOF'
rm -f ptx_ld_inst_metrics.txt
for line in `cat metrics.txt`
do/usr/local/NVIDIA-Nsight-Compute/ncu --metrics $line \./ptx_ld_inst_main 2>&1 | grep "$line" | grep -v "n/a" | tee -a "ptx_ld_inst_metrics.txt"/usr/local/NVIDIA-Nsight-Compute/ncu --metrics $line \./ptx_ld_inst_main 2>&1 | grep "$line" | grep -v "n/a" | tee -a "ptx_ld_inst_metrics.txt"
done
EOF
bash get_metrics.sh