CUDA L2Cache Profiling
- 一.小结
- 二.测试L2 Cache的驱逐策略
- 三.测试Kernel执行完成后,l2Cache是否会被清
一.小结
- 当所有的warp都访问 2(warpcount)*32(threads)*4(bytes)的DRAM区间时,因DRAM BANK冲突,导致耗时太长
- 开启l2 cache后,性能大幅提升
- 推测kernel执行完之后或运行之前,L2Cache会被invalidate,导致新的Kernel复用同一块内存时,LD指令的耗时没有明显降低
- Profiling发现l2 cache hit也没有变化
- 也有可能因L2容量太小,导致差异不明显
二.测试L2 Cache的驱逐策略
tee l2cache_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>

// Abort with a readable message when a CUDA runtime call fails.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Small working set: all warps fold onto 100(warps)*4(sectors)*32(bytes) = 12800 bytes,
// far below L2 capacity, so repeated accesses should hit in L2.
// Half the warps load with the evict_first hint, half with evict_last, so the
// per-policy lts__* counters can be compared in ncu.
// NOTE(review): warp_count is currently unused — the reuse window is hard-coded to 100.
__global__ void kernel(float *addr, int warp_count)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_index = tid / 32;
    unsigned int warp_offset = tid % 32;
    // All warps reuse 100(warps)*4(sectors)*32(bytes) = 12800 bytes of data.
    warp_index = warp_index % 100;
    // Per-warp contiguous 128B window: each warp fetches exactly one cache line.
    unsigned int offset = warp_index * 32 + warp_offset;
    float value;
    // Exercise two different eviction-priority hints (requires SM70+ PTX).
    if (warp_index < 50) {
        asm("ld.global.L1::evict_first.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    } else {
        asm("ld.global.L1::evict_last.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    }
    //asm("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    //asm("ld.global.L1::evict_normal.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    value += tid;
    //addr[offset]=value;
    // Write-through store (.wt) so the result is not retained in L1.
    asm("st.global.wt.f32 [%0],%1;" :: "l"(&addr[offset]), "f"(value));
}

// Same access pattern as kernel(), but the reuse window is
// 30000(warps)*128(bytes) ≈ 3.8 MB, sized to overflow L2 and force misses.
// NOTE(review): warp_count is unused here as well.
__global__ void kernel_big(float *addr, int warp_count)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_index = tid / 32;
    unsigned int warp_offset = tid % 32;
    warp_index = warp_index % 30000;
    unsigned int offset = warp_index * 32 + warp_offset;
    float value;
    // Two eviction-priority hints, split across the warp range.
    if (warp_index < 15000) {
        asm("ld.global.L1::evict_first.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    } else {
        asm("ld.global.L1::evict_last.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    }
    //asm("ld.global.cg.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    //asm("ld.global.L1::evict_normal.f32 %0, [%1];" : "=f"(value) : "l"(&addr[offset]));
    value += tid;
    //addr[offset]=value;
    asm("st.global.wt.f32 [%0],%1;" :: "l"(&addr[offset]), "f"(value));
}

// Launch the small-working-set kernel twice back to back (to see whether the
// second launch hits in L2), then the large-working-set kernel once.
int main(int argc, char *argv[])
{
    int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));
    int block_count = 28 * 10000;
    int block_size = 32 * 4 * 8;              // 1024 threads = 32 warps per block
    int thread_size = block_count * block_size;
    int warp_count = thread_size / 32;
    // [sic] "secotrs" typo kept byte-identical so output matches the recorded transcript.
    printf("total secotrs:%d\n", thread_size * 4 / 32);
    float *addr;
    CHECK_CUDA(cudaMalloc(&addr, thread_size * 4));
    kernel<<<block_count, block_size>>>(addr, warp_count);
    kernel<<<block_count, block_size>>>(addr, warp_count);
    CHECK_CUDA(cudaGetLastError());           // catch launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize());      // surface async execution errors
    kernel_big<<<block_count, block_size>>>(addr, warp_count);
    CHECK_CUDA(cudaGetLastError());
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaFree(addr));               // was leaked in the original
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -lineinfo -o l2cache_test l2cache_test.cu \
  -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
smsp__sass_inst_executed_op_global_ld.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit,\
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_miss.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_hit.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_miss.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_hit.sum,\
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_miss.sum,\
lts__t_requests_srcunit_tex_op_read.sum,\
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum,\
l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct,\
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum,\
lts__t_sectors_srcunit_tex_op_read.sum ./l2cache_test
输出
total secotrs:35840000
kernel(float *, int) (280000, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
-------------------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------------------------------- ----------- ------------
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum sector 11,200 #100(warp)*4(sectors)*28(sm)=11200
l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct % 99.97 #(35840000-11200)/35840000=0.9996875=99.968%->99.97%
lts__t_requests_srcunit_tex_op_read.sum request 2,800 #1(request/warp)*100(warp)*28(sm)=2800
lts__t_sectors_srcunit_tex_op_read.sum sector 11,200
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 5,400 #11200/2-200=5400
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_miss.sum sector 200 #50(warp)*4(sectors)=200
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_hit.sum sector 0 #TODO
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_miss.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_hit.sum sector 5,400 #同上
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_miss.sum sector 200
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.avg sector 300 #sum/avg=5400/300=18(l2 slice个数)
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.max sector 864 #
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.min sector 0 #说明不均衡
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 5,400
smsp__sass_inst_executed_op_global_ld.sum inst 8,960,000 #总的warp指令条数=28*10000*4*8=8960000
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum sector 0
-------------------------------------------------------------------------------- ----------- ------------kernel(float *, int) (280000, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
-------------------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------------------------------- ----------- ------------
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum sector 11,200
l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct % 99.97
lts__t_requests_srcunit_tex_op_read.sum request 2,800
lts__t_sectors_srcunit_tex_op_read.sum sector 11,200
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 5,400
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_miss.sum sector 200 #跟上面的Kernel一样,难道kernel执行完成之后,L2Cache被清了?
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_hit.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_miss.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_hit.sum sector 5,400
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_miss.sum sector 200
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.avg sector 300
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.max sector 864
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.min sector 0
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 5,400
smsp__sass_inst_executed_op_global_ld.sum inst 8,960,000
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum sector 0
-------------------------------------------------------------------------------- ----------- ------------kernel_big(float *, int) (280000, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
-------------------------------------------------------------------------------- ----------- ------------
Metric Name Metric Unit Metric Value
-------------------------------------------------------------------------------- ----------- ------------
l1tex__m_xbar2l1tex_read_sectors_mem_lg_op_ld.sum sector 35,473,928
l1tex__t_sector_pipe_lsu_mem_global_op_ld_hit_rate.pct % 1.05
lts__t_requests_srcunit_tex_op_read.sum request 8,868,482
lts__t_sectors_srcunit_tex_op_read.sum sector 35,473,928
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_miss.sum sector 17,753,904 #超过Cache大小时,cache失效
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_hit.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_last_lookup_miss.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_hit.sum sector 0
lts__t_sectors_srcunit_tex_op_read_evict_normal_lookup_miss.sum sector 17,713,084
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.avg sector 0
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.max sector 0
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.min sector 0
lts__t_sectors_srcunit_tex_op_read_evict_first_lookup_hit.sum sector 0
smsp__sass_inst_executed_op_global_ld.sum inst 8,960,000
smsp__sass_l1tex_m_xbar2l1tex_read_sectors_mem_global_op_ldgsts_cache_bypass.sum sector 0
-------------------------------------------------------------------------------- ----------- ------------
三.测试Kernel执行完成后,l2Cache是否会被清
- 输出
total secotrs:35840000
warpup
kernel_without_l2cache
E2E:20.6264 ms Kernel:20.6138 ms #当所有的warp都访问 2(warpcount)*32(threads)*4(bytes)的DRAM区间时,因DRAM BANK冲突,导致耗时太长
E2E:20.5642 ms Kernel:20.5536 ms
E2E:20.5506 ms Kernel:20.5413 ms
kernel_l2cache
E2E:3.85756 ms Kernel:3.84717 ms #当开启l2 cache后,性能大幅提升
E2E:3.85328 ms Kernel:3.84512 ms
E2E:3.85268 ms Kernel:3.84416 ms
kernel_l2cache+Invalidate
E2E:3.85734 ms Kernel:3.84602 ms #每次执行完Kernel后,Invalidate the data in l2,性能没有差异,推测Kernel执行完之后或之前,内部会刷l2 cache
E2E:3.85134 ms Kernel:3.84512 ms
E2E:3.85482 ms Kernel:3.8441 ms
- 复现过程
tee l2cache_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
#include <chrono>
#include <thread>

// Abort with a readable message when a CUDA runtime call fails.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Number of distinct warp-sized windows the grid folds onto; every warp in the
// grid reuses one of warp_count 128-byte regions.
__constant__ unsigned int warp_count = 2;

// Load with ld.global.cv ("cache volatile"): bypasses L1/L2 so every access
// goes to DRAM. All warps contend on a tiny 2(warps)*32(threads)*4(bytes) region.
__global__ void kernel_without_l2cache(volatile float *input, volatile float *output)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_index = tid / 32;
    unsigned int warp_offset = tid % 32;
    warp_index = warp_index % warp_count;
    unsigned int offset = warp_index * 32 + warp_offset;
    float value = 0;
    asm volatile("ld.global.cv.f32 %0, [%1];" : "=f"(value) : "l"(&input[offset]));
    // Consume the value so the load cannot be optimized away.
    if (tid == 0) { printf("%f\r", value); }
}

// Same access pattern, but loaded with the evict_last hint so the lines are
// kept resident in cache across the whole run.
__global__ void kernel_l2cache(volatile float *input, volatile float *output)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_index = tid / 32;
    unsigned int warp_offset = tid % 32;
    warp_index = warp_index % warp_count;
    unsigned int offset = warp_index * 32 + warp_offset;
    float value = 0;
    asm volatile("ld.global.L1::evict_last.f32 %0, [%1];" : "=f"(value) : "l"(&input[offset]));
    if (tid == 0) { printf("%f\r", value); }
}

// One lane per warp discards the warp's 128-byte line from L2 without
// writeback (discard.global.L2, requires SM80+).
__global__ void invalidate_l2cache(volatile float *input, volatile float *output)
{
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int warp_index = tid / 32;
    unsigned int warp_offset = tid % 32;
    warp_index = warp_index % warp_count;
    if (warp_offset == 0) {
        unsigned int offset = warp_index * 32 + warp_offset;
        asm("discard.global.L2 [%0],128;" :: "l"(&input[offset])); // invalidate the data in L2
    }
}

// Launch `k` once on `stream` and print both wall-clock (E2E, std::chrono) and
// device-side (CUDA event) elapsed time. Shared by all three measurement loops.
static void time_launch(void (*k)(volatile float *, volatile float *),
                        float *addr, int block_count, int block_size,
                        cudaStream_t stream, cudaEvent_t start_ev, cudaEvent_t stop_ev)
{
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    k<<<block_count, block_size, 0, stream>>>(addr, addr);
    cudaEventRecord(stop_ev, stream);
    CHECK_CUDA(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    std::cout << "E2E:" << diff.count() * 1000 << " ms" << " Kernel:" << milliseconds << " ms" << std::endl;
}

// Time three variants: no-cache loads, cached loads, and cached loads with an
// explicit L2 discard before each timed launch.
int main(int argc, char *argv[])
{
    int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));
    int block_count = 28 * 10000;
    int block_size = 32 * 4 * 8;              // 1024 threads = 32 warps per block
    int thread_size = block_count * block_size;
    // [sic] "secotrs" typo kept byte-identical so output matches the recorded transcript.
    printf("total secotrs:%d\n", thread_size * 4 / 32);
    {
        // Warm-up pass on a throwaway allocation. [sic] "warpup" kept as-is.
        printf("warpup\n");
        float *addr;
        CHECK_CUDA(cudaMalloc(&addr, thread_size * 4));
        kernel_l2cache<<<block_count, block_size>>>(addr, addr);
        kernel_without_l2cache<<<block_count, block_size>>>(addr, addr);
        CHECK_CUDA(cudaGetLastError());
        CHECK_CUDA(cudaDeviceSynchronize());
        CHECK_CUDA(cudaFree(addr));
    }
    float *addr;
    CHECK_CUDA(cudaMalloc(&addr, thread_size * 4));
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));
    cudaEvent_t start_ev, stop_ev;
    CHECK_CUDA(cudaEventCreate(&start_ev));
    CHECK_CUDA(cudaEventCreate(&stop_ev));

    printf("kernel_without_l2cache\n");
    for (int i = 0; i < 3; i++) {
        time_launch(kernel_without_l2cache, addr, block_count, block_size, stream, start_ev, stop_ev);
    }
    printf("kernel_l2cache\n");
    for (int i = 0; i < 3; i++) {
        time_launch(kernel_l2cache, addr, block_count, block_size, stream, start_ev, stop_ev);
    }
    printf("kernel_l2cache+Invalidate\n");
    for (int i = 0; i < 3; i++) {
        // Drop the cached lines from L2 before the timed launch.
        invalidate_l2cache<<<block_count, block_size>>>(addr, addr);
        CHECK_CUDA(cudaGetLastError());
        CHECK_CUDA(cudaDeviceSynchronize());
        time_launch(kernel_l2cache, addr, block_count, block_size, stream, start_ev, stop_ev);
    }

    // Release resources (the original leaked the stream, events, and buffer).
    CHECK_CUDA(cudaEventDestroy(start_ev));
    CHECK_CUDA(cudaEventDestroy(stop_ev));
    CHECK_CUDA(cudaStreamDestroy(stream));
    CHECK_CUDA(cudaFree(addr));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 \
  -o l2cache_test l2cache_test.cu \
  -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
./l2cache_test