背景:想统计一条ld指令需要经过哪些硬件单元
步骤:
- cuda Kernel里只包含一条load指令,但如果没有st会被编译器优化掉(ptx还在,但sass里却没了)
- 暂时没有找到编译选项关掉该优化
- 于是采用CuAssembler将ST SASS指令删掉,重新生成fatbin
- 采用cuModuleLoad加载fatbin,用cuLaunchKernel运行该Kernel
SASSCuAssembler_10">1.准备SASS反汇编工具CuAssembler
git clone https://github.com/cloudcores/CuAssembler
export PATH=${PATH}:$PWD/CuAssembler/bin:/usr/local/cuda/bin/
export PYTHONPATH=${PYTHOPATH}:$PWD/CuAssembler/
pip install pyelftools
2.仅包含ld.global.cv.f32的cuda kernel,如果不加st指令,编译器会将ld指令也优化掉。后面手动修改汇编指令删除掉st指令
tee ptx_ld_inst.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
__global__ void ptx_ld_inst_kernel(float *input, float *out) {float d;int tid = threadIdx.x + blockIdx.x * blockDim.x;asm("ld.global.cv.f32 %0, [%1];" : "=f"(d) : "l"(&input[tid]));asm("st.global.wt.f32 [%0],%1;" :: "l"(&out[tid]),"f"(d));
}
EOF
3.生成fatbin
# 生成ptx
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx ptx_ld_inst.cu -o ptx_ld_inst.ptx
# 生成cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.ptx -cubin -o ptx_ld_inst.cubin
# 生成fatbin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# 查看ptx
cat ptx_ld_inst.ptx
# 查看sass指令
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
# 输出:/*0070*/ LDG.E.STRONG.SYS R3, [R2.64] ; /* 0x0000000402037981 *//* 0x000ea2000c1f5900 *//*0080*/ IMAD.WIDE R4, R4, R5, c[0x0][0x168] ; /* 0x00005a0004047625 *//* 0x000fca00078e0205 *//*0090*/ STG.E.STRONG.SYS [R4.64], R3 ; /* 0x0000000304007986 *//* 0x004fe2000c115904 *//*00a0*/ EXIT ; /* 0x000000000000794d */
SASSSTGESTRONGSYSfatbin_54">4.修改SASS指令,删除掉STG.E.STRONG.SYS指令,重新生成fatbin
cuasm.py ptx_ld_inst.cubin ptx_ld_inst.cuasm
cat ptx_ld_inst.cuasm | grep "STG.E.STRONG.SYS" -B 2
# 输出[B------:R-:W2:-:S01] /*0070*/ LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ;[B------:R-:W-:Y:S05] /*0080*/ IMAD.WIDE R4, R4, R5, c[0x0][0x168] ;[B--2---:R-:W-:-:S01] /*0090*/ STG.E.STRONG.SYS desc[UR4][R4.64], R3 ;# 删除这二行
sed '/STG.E.STRONG.SYS/d' -i ptx_ld_inst.cuasm
sed '/IMAD.WIDE R4/d' -i ptx_ld_inst.cuasm# 生新行成cubin
cuasm.py ptx_ld_inst.cuasm
# 生成fatbin
/usr/local/cuda/bin/nvcc -arch=sm_86 ptx_ld_inst.cubin -fatbin -o ptx_ld_inst.fatbin
# 查看sass指令
/usr/local/cuda/bin/cuobjdump --dump-sass ptx_ld_inst.fatbin
输出:/*0050*/ IMAD R4, R3, c[0x0][0x0], R4 ; /* 0x0000000003047a24 *//* 0x001fc800078e0204 *//*0060*/ IMAD.WIDE R2, R4, R5, c[0x0][0x160] ; /* 0x0000580004027625 *//* 0x000fcc00078e0205 *//*0070*/ LDG.E.STRONG.SYS R3, desc[UR4][R2.64] ; /* 0x0000000402037981 *//* 0x000ea2200c1f5900 *//*0080*/ EXIT ; /* 0x000000000000794d */
5.准备测试程序,加载fatbin并运行里面的Kernel
tee ptx_ld_inst_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>int main(int argc,char *argv[])
{CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount = 0;error = cuDeviceGetCount(&deviceCount);error = cuDeviceGet(&cuDevice, 0);if(error!=CUDA_SUCCESS){printf("Error happened in get device!\n");}CUcontext cuContext;error = cuCtxCreate(&cuContext, 0, cuDevice);if(error!=CUDA_SUCCESS){printf("Error happened in create context!\n");}CUmodule module;CUfunction function;const char* module_file = "ptx_ld_inst.fatbin";const char* kernel_name = "_Z18ptx_ld_inst_kernelPfS_";error = cuModuleLoad(&module, module_file);if(error!=CUDA_SUCCESS){printf("Error happened in load moudle %d!\n",error);}error = cuModuleGetFunction(&function, module, kernel_name);if(error!=CUDA_SUCCESS){printf("get function error!\n");}int data_size=sizeof(float)*8192;float *output_ptr=nullptr;float *input_ptr=nullptr;int cudaStatus=0;cudaStatus = cudaMalloc((void**)&input_ptr, data_size);cudaStatus = cudaMalloc((void**)&output_ptr, data_size);void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};cuLaunchKernel(function,1, 1, 1,32, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0;
}
EOF
g++ ptx_ld_inst_main.cpp -o ptx_ld_inst_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
6.ncu profing
/usr/local/NVIDIA-Nsight-Compute/ncu --set full --section SpeedOfLight_HierarchicalTensorRooflineChart \--target-processes all --clock-control=none \--print-details all --export ncu_report_ptx_ld_inst -f ./ptx_ld_inst_main/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct,\
l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio,\
l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max,\
l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_requests_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max,\
l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max,\
l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max,\
sm__sass_data_bytes_mem_global_op_ld.max,\
sm__sass_inst_executed_op_global_ld.max,\
sm__sass_inst_executed_op_ld.max,\
sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max,\
smsp__inst_executed_op_global_ld.max,\
smsp__inst_executed_op_global_ld_pred_on_any.max,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct,\
smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio,\
smsp__sass_data_bytes_mem_global_op_ld.max,\
smsp__sass_inst_executed_op_global_ld.max,\
smsp__sass_inst_executed_op_ld.max,\
smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max ./ptx_ld_inst_main
输出
----------------------------------------------------------------------- ----------- ------------Metric Name Metric Unit Metric Value----------------------------------------------------------------------- ----------- ------------l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.max_rate sector/1 32l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.pct % 12.50l1tex__average_t_sectors_per_request_pipe_lsu_mem_global_op_ld.ratio sector/1 4l1tex__data_bank_conflicts_pipe_lsu_mem_global_op_ld.max 0l1tex__m_xbar2l1tex_read_bytes_mem_lg_op_ld.max byte 128l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.max sector 4l1tex__t_bytes_pipe_lsu_mem_global_op_ld.max byte 128l1tex__t_bytes_pipe_lsu_mem_global_op_ld_lookup_miss.max byte 128l1tex__t_output_wavefronts_pipe_lsu_mem_global_op_ld.max 1l1tex__t_requests_pipe_lsu_mem_global_op_ld.max 1l1tex__t_sectors_pipe_lsu_mem_global_op_ld.max sector 4l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_miss.max sector 4l1tex__t_set_accesses_pipe_lsu_mem_global_op_ld.max 1l1tex__t_set_conflicts_pipe_lsu_mem_global_op_ld.max cycle 0sm__sass_data_bytes_mem_global_op_ld.max byte 128sm__sass_inst_executed_op_global_ld.max inst 1sm__sass_inst_executed_op_ld.max inst 1sm__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max sector 4smsp__inst_executed_op_global_ld.max inst 1smsp__inst_executed_op_global_ld_pred_on_any.max inst 1smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.max_rate byte/sector 32smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.pct % 100smsp__sass_average_data_bytes_per_sector_mem_global_op_ld.ratio byte/sector 32smsp__sass_data_bytes_mem_global_op_ld.max byte 128smsp__sass_inst_executed_op_global_ld.max inst 1smsp__sass_inst_executed_op_ld.max inst 1smsp__sass_l1tex_t_sectors_pipe_lsu_mem_global_op_ld.max sector 4----------------------------------------------------------------------- ----------- ------------
7.获取NCU支持的metrics列表
/usr/local/NVIDIA-Nsight-Compute/ncu --query-metrics \--csv | awk -F, '{print $1}' | sed 's/"//g' | tail -n +2 > metrics.txt
8.查询每个metrics
tee get_metrics.sh<<-'EOF'
rm -f ptx_ld_inst_metrics.txt
for line in `cat metrics.txt`
do/usr/local/NVIDIA-Nsight-Compute/ncu --metrics $line \./ptx_ld_inst_main 2>&1 | grep "$line" | grep -v "n/a" | tee -a "ptx_ld_inst_metrics.txt"/usr/local/NVIDIA-Nsight-Compute/ncu --metrics $line \./ptx_ld_inst_main 2>&1 | grep "$line" | grep -v "n/a" | tee -a "ptx_ld_inst_metrics.txt"
done
EOF
bash get_metrics.sh