Exploring How NVIDIA GPU PeerAccess Reaches Beyond the PCIe BAR Window Size



      • 1. Related links
      • 2. Observed behavior
      • 3. Upgrade to cuda_12.6.2 [optional]
      • 4. Install open-gpu-kernel-modules [optional, if you need to debug the NVIDIA driver source]
      • 5. Test in-kernel access to host memory and H2D copies
      • 6. Set up pcm to monitor host-memory bandwidth and determine whether PeerAccess passes through host memory
      • 7. Test PeerAccess
      • 8. Read GPU BAR1 directly with devmem (test whether the nsys "PCIe Read Request to BAR1" metric only counts the GPU driver's reads/writes of BAR1)


1. Related links

  • IOMMU VFIO GROUP
  • DMAR table + IOMMU
  • What is an IOMMU
  • Shared Virtual Addressing for high performance Arm Infrastructure platforms
  • PCIe Read Request to BAR1

2. Observed behavior

  • When GPU0 reads data from GPU1 via PeerAccess, GPU0's PCIe link shows about 21% TX utilization while GPU1's shows about 21% RX, as if GPU0 were configuring GPU1 over PCIe during the transfer.
  • Neither GPU's BAR1 is accessed while the kernel runs, and NV exposes no BAR1 metrics for it; one possibility is that GPU0 programs GPU1's BAR1 window mapping through GPU1's BAR0 (BAR1 usage can also be watched with the NVML sketch after this list).
  • If the kernel instead triggered a GPU MMU page fault on GPU0 and the host driver set up the BAR1 mapping through BAR0, GPU0's PCIe link should show no TX utilization.
  • As a next step, the traffic could be captured with a logic analyzer for further analysis.
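
One way to sanity-check the BAR1 hypothesis without a logic analyzer is to poll BAR1 usage from NVML while the experiment runs in another shell. Below is a minimal sketch, assuming the nvml.h header from the CUDA toolkit and the nvidia-ml library; the file name bar1_watch.c, the device-index argument, and the one-second sampling loop are illustrative choices, not part of the original experiment.

// bar1_watch.c -- poll BAR1 usage via NVML while a test runs in another shell
// build sketch: gcc -o bar1_watch bar1_watch.c -I/usr/local/cuda/include -lnvidia-ml
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <nvml.h>

#define NVML_CHECK(call) do { nvmlReturn_t r = (call); \
    if (r != NVML_SUCCESS) { fprintf(stderr, "NVML error %s at line %d\n", nvmlErrorString(r), __LINE__); exit(1); } } while (0)

int main(int argc, char **argv) {
    unsigned int index = (argc > 1) ? (unsigned int)atoi(argv[1]) : 0;
    NVML_CHECK(nvmlInit());
    nvmlDevice_t dev;
    NVML_CHECK(nvmlDeviceGetHandleByIndex(index, &dev));
    for (int i = 0; i < 10; i++) {  // sample once per second for ten seconds
        nvmlBAR1Memory_t bar1;
        NVML_CHECK(nvmlDeviceGetBAR1MemoryInfo(dev, &bar1));
        printf("BAR1 total:%lluMB used:%lluMB free:%lluMB\n",
               bar1.bar1Total >> 20, bar1.bar1Used >> 20, bar1.bar1Free >> 20);
        sleep(1);
    }
    nvmlShutdown();
    return 0;
}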

3. Upgrade to cuda_12.6.2 [optional]

wget https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
sudo apt-get --purge -y remove 'nvidia*'
sudo sh cuda_12.6.2_560.35.03_linux.run

4. Install open-gpu-kernel-modules [optional, if you need to debug the NVIDIA driver source]

git clone --branch 560.35.03 --single-branch https://github.com/NVIDIA/open-gpu-kernel-modules.git
cd open-gpu-kernel-modules
git branch
git checkout -b 560.35.03
make modules -j$(nproc)   # build the modules before loading them
rmmod nvidia-uvm
rmmod nvidia-drm
rmmod nvidia-modeset
rmmod nvidia
dmesg -C
insmod kernel-open/nvidia.ko
insmod kernel-open/nvidia-modeset.ko
insmod kernel-open/nvidia-drm.ko
insmod kernel-open/nvidia-uvm.ko

5. Test in-kernel access to host memory and H2D copies

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

// copies input to output; with a pinned host pointer the loads cross PCIe (zero-copy)
template<int mode>
__global__ void copyKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx] = input_data[idx];
}

// increments the input buffer in place
template<int mode>
__global__ void incKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    input_data[idx] += 1;
}

// times f() both end-to-end on the host and on-device via CUDA events
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%7.2fms Kernel:%7.2fms\n", diff.count() * 1000, milliseconds);
}

int main() {
    int devID0 = 0;
#define block_size 1024L
#define block_count ((1024<<20)/block_size/4)  // 1GB of floats, larger than the BAR window
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0;
    float *data1;
    CUDA_CHECK(cudaSetDevice(devID0));
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    CUDA_CHECK(cudaMallocHost(&data0, dataSize));  // pinned host memory
    CUDA_CHECK(cudaMalloc(&data1, dataSize));      // device memory
    cudaEvent_t start_ev, stop_ev;
    cudaEventCreate(&start_ev);
    cudaEventCreate(&stop_ev);
    // H2D copy through the copy engine
    TIMEIT([&](cudaStream_t &stream)-> void {cudaMemcpyAsync(data1, data0, dataSize, cudaMemcpyHostToDevice, stream);}, stream, start_ev, stop_ev);
    // kernel reads pinned host memory directly
    TIMEIT([&](cudaStream_t &stream)-> void {copyKernel<1><<<block_count, block_size, 0, stream>>>(data0, data1);}, stream, start_ev, stop_ev);
    // kernel writes pinned host memory in place
    TIMEIT([&](cudaStream_t &stream)-> void {incKernel<1><<<block_count, block_size, 0, stream>>>(data0, data1);}, stream, start_ev, stop_ev);
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profiling_report.nsys-rep -f true -t cuda,nvtx --gpu-metrics-frequency=100 --gpu-metrics-devices=0 ./p2p
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p

From the L2 metrics, all of the requests went to the peer aperture.

6. Set up pcm to monitor host-memory bandwidth and determine whether PeerAccess passes through host memory

git clone --recursive https://github.com/intel/pcm
cd pcm
git submodule update --init --recursive
mkdir build
cd build
cmake ..
cmake --build .
cmake --build . --config Release
./bin/pcm-memory

7. Test PeerAccess

tee p2p.cu<<-'EOF'
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) \
do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
        fprintf(stderr, "CUDA error in file '%s' in line %i: %s.\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

template<int mode>
__global__ void dummyKernel(float *input_data, float *output_data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    output_data[idx] = input_data[idx];
}

// times f() both end-to-end on the host and on-device via CUDA events
template <typename F>
void TIMEIT(F const &f, cudaStream_t &stream, cudaEvent_t &start_ev, cudaEvent_t &stop_ev)
{
    CUDA_CHECK(cudaDeviceSynchronize());
    auto start = std::chrono::high_resolution_clock::now();
    cudaEventRecord(start_ev, stream);
    f(stream);
    cudaEventRecord(stop_ev, stream);
    CUDA_CHECK(cudaEventSynchronize(stop_ev));
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> diff = end - start;
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start_ev, stop_ev);
    printf("E2E:%7.2fms Kernel:%7.2fms errno:%d\n", diff.count() * 1000, milliseconds, (int)cudaGetLastError());
}

int main() {
    int devID0 = 0, devID1 = 1;
    int device_count = 0;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    for (int deviceid = 0; deviceid < 2; deviceid++) {
        CUDA_CHECK(cudaSetDevice(deviceid));
        cudaDeviceProp deviceProp;
        CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, deviceid));
        std::cout << "-----------------------------------" << std::endl;
        std::cout << "Device Index: " << deviceid << std::endl;
        std::cout << "Compute Capability:" << deviceProp.major << "." << deviceProp.minor << std::endl;
        std::cout << "Device name: " << deviceProp.name << std::endl;
        std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl;
        std::cout << "Shared memory per block: " << deviceProp.sharedMemPerBlock << " bytes" << std::endl;
        std::cout << "Max blocks per SM: " << deviceProp.maxBlocksPerMultiProcessor << std::endl;
        std::cout << "asyncEngineCount: " << deviceProp.asyncEngineCount << std::endl;
        std::cout << "directManagedMemAccessFromHost: " << deviceProp.directManagedMemAccessFromHost << std::endl;
        std::cout << "unifiedAddressing: " << deviceProp.unifiedAddressing << std::endl;
        std::cout << "Number of SMs: " << deviceProp.multiProcessorCount << std::endl;
    }
    std::cout << "-----------------------------------" << std::endl;
    int p2p_value = 0;
    CUDA_CHECK(cudaDeviceGetP2PAttribute(&p2p_value, cudaDevP2PAttrAccessSupported, devID0, devID1));
    std::cout << "cudaDevP2PAttrAccessSupported: " << p2p_value << std::endl;
#define block_size 1024L
#define block_count ((512<<20)/block_size/4)
    size_t dataSize = block_count * block_size * sizeof(float);
    float *data0_dev, *data1_dev, *data1_dev_ex;
    CUDA_CHECK(cudaSetDevice(devID0));
    CUDA_CHECK(cudaMalloc(&data0_dev, dataSize));
    CUDA_CHECK(cudaSetDevice(devID1));
    CUDA_CHECK(cudaMalloc(&data1_dev, dataSize));
    CUDA_CHECK(cudaMalloc(&data1_dev_ex, dataSize));
    float *host;
    CUDA_CHECK(cudaMallocHost(&host, dataSize));
    printf("Init Done(%.2f)MB..\n", dataSize / 1024.0 / 1024.0);
    // enable P2P
    int canAccessPeer = 0;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&canAccessPeer, devID0, devID1));
    if (canAccessPeer) {
        CUDA_CHECK(cudaSetDevice(devID1));
        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t start_ev, stop_ev;
        cudaEventCreate(&start_ev);
        cudaEventCreate(&stop_ev);
        CUDA_CHECK(cudaDeviceEnablePeerAccess(devID0, 0));  // let devID1 access devID0's device memory
        do {
            //TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<1><<<block_count, block_size,0,stream>>>(host,data1_dev);},stream,start_ev,stop_ev);
            TIMEIT([&](cudaStream_t &stream)-> void {dummyKernel<2><<<block_count, block_size,0,stream>>>(data0_dev,data1_dev);},stream,start_ev,stop_ev);
        } while (0);
        CUDA_CHECK(cudaDeviceDisablePeerAccess(devID0));
    } else {
        printf("%s %d canAccessPeer=0\n", __FILE__, __LINE__);
    }
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaFree(data0_dev));
    CUDA_CHECK(cudaFree(data1_dev));
    CUDA_CHECK(cudaFree(data1_dev_ex));
    return 0;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -o p2p p2p.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64  -lcuda
export CUDA_VISIBLE_DEVICES=6,7
/usr/local/cuda/bin/ncu --metrics \
dram__bytes_read.sum,\
dram__bytes_write.sum,\
lts__t_sectors_srcunit_tex_aperture_sysmem.sum,\
lts__t_sectors_srcunit_tex_aperture_peer.sum,\
lts__t_sectors_srcunit_tex_aperture_device.sum,\
pcie__read_bytes.sum,\
pcie__write_bytes.sum,\
lts__t_bytes.sum,\
smsp__sass_inst_executed_op_global_ld.sum ./p2p
/usr/local/cuda/bin/nsys profile --stats=true -o cuda_profiling_report_p2p.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=4,7 ./p2p

The experiment shows that PeerAccess traffic does not pass through host memory.
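
As an additional cross-check, the same P2P route can be exercised through the copy engines instead of kernel loads/stores. The sketch below is illustrative only, assuming devices 0 and 1 are the peer-capable pair (e.g. after the CUDA_VISIBLE_DEVICES=6,7 remapping above); the file name peer_copy.cu and the timing scheme are not part of the original test.

// peer_copy.cu -- time a copy-engine P2P transfer between two peer GPUs
// build sketch: /usr/local/cuda/bin/nvcc -std=c++17 -o peer_copy peer_copy.cu
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    size_t bytes = 512ULL << 20;               // same 512MB payload as the kernel test
    float *src, *dst;
    cudaSetDevice(0);
    cudaMalloc(&src, bytes);
    cudaSetDevice(1);
    cudaMalloc(&dst, bytes);
    cudaDeviceEnablePeerAccess(0, 0);          // let device 1 reach device 0's memory
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0, 0);
    cudaMemcpyPeerAsync(dst, 1, src, 0, bytes, 0);  // DMA path, no SM loads/stores
    cudaEventRecord(t1, 0);
    cudaEventSynchronize(t1);
    float ms = 0;
    cudaEventElapsedTime(&ms, t0, t1);
    printf("cudaMemcpyPeerAsync: %.2f ms (%.2f GB/s)\n", ms, bytes / ms / 1e6);
    cudaFree(dst);
    cudaSetDevice(0);
    cudaFree(src);
    return 0;
}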

8. Read GPU BAR1 directly with devmem (test whether the nsys "PCIe Read Request to BAR1" metric only counts the GPU driver's reads/writes of BAR1)

Conclusion: the metric also counts BAR1 accesses that originate outside the GPU driver.

lspci -s `nvidia-smi -q | grep "Bus Id" | awk '{print $4}'` -v | grep "Memory at" | sed -n "2p"
Memory at 383fd0000000 (64-bit, prefetchable) [size=256M]

tee devmem.c<<-'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>

#define FATAL do { fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n", \
    __LINE__, __FILE__, errno, strerror(errno)); exit(1); } while(0)

#define MAP_SIZE (32<<20)
#define MAP_MASK (MAP_SIZE - 1)

// microseconds since the epoch
unsigned long GetTickCount()
{
    struct timeval tv;
    if (gettimeofday(&tv, NULL) != 0) return 0;
    return (tv.tv_sec * 1000000) + (tv.tv_usec);
}

int main(int argc, char **argv) {
    int fd;
    void *map_base, *virt_addr;
    off_t target;
    if (argc < 4) { fprintf(stderr, "usage: %s <phys-addr-hex> <size> <value>\n", argv[0]); return 1; }
    target = strtoul(argv[1], 0, 16) & ~MAP_MASK;   // align down to the 32MB mapping window
    unsigned int size = atoi(argv[2]);
    unsigned char value = atoi(argv[3]);
    if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1) FATAL;
    map_base = mmap(0, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, target);
    if (map_base == (void *) -1) FATAL;
    virt_addr = map_base;
    unsigned char *wbuffer = (unsigned char*)malloc(size);
    for (unsigned int i = 0; i < size; i++) { wbuffer[i] = value; }
    unsigned char *rbuffer = (unsigned char*)malloc(size);
    for (unsigned int i = 0; i < size; i++) { rbuffer[i] = 0x51; }
    unsigned long t0 = GetTickCount();
    memcpy(virt_addr, wbuffer, size);               // write through the BAR1 mapping
    msync(virt_addr, size, MS_SYNC);
    __asm__ __volatile__("" ::: "memory");
    unsigned long t1 = GetTickCount();
    printf("mem:%llx %lu(usec)\n", (unsigned long long)target, t1 - t0);
    memcpy(rbuffer, virt_addr, size);               // read back through the BAR1 mapping
    __asm__ __volatile__("" ::: "memory");          // without the barrier the comparison below can be inconsistent
    for (unsigned int i = 0; i < size; i++) {
        if (wbuffer[i] != rbuffer[i]) { printf("mismatch at %d %02x %02x\n", i, wbuffer[i], rbuffer[i]); break; }
    }
    if (munmap(map_base, MAP_SIZE) == -1) FATAL;
    close(fd);
    return 0;
}
EOF
gcc -o devmem devmem.c  -std=c99 -g
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 32 32
/usr/local/cuda/bin/nsys profile --stats=true \
-o bar1_access.nsys-rep -f true -t cuda,nvtx --gpu-metrics-device=0 ./devmem 383fd0000000 1024 32
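
Instead of parsing lspci output, the BAR layout can also be read from the device's sysfs resource file. The following is a small sketch; the BDF argument (e.g. 0000:3b:00.0) is a placeholder to be taken from nvidia-smi, and treating the second entry as BAR1 is an assumption that matches the lspci output above.

// read_bars.c -- list BAR base addresses and sizes from /sys/bus/pci/devices/<BDF>/resource
#include <stdio.h>
#include <inttypes.h>

int main(int argc, char **argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s <BDF, e.g. 0000:3b:00.0>\n", argv[0]); return 1; }
    char path[256];
    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/resource", argv[1]);
    FILE *f = fopen(path, "r");
    if (!f) { perror(path); return 1; }
    uint64_t start, end, flags;
    // one "start end flags" line per resource; for these GPUs BAR1 is typically the second entry
    for (int bar = 0; fscanf(f, "%" SCNx64 " %" SCNx64 " %" SCNx64, &start, &end, &flags) == 3; bar++) {
        if (start)
            printf("BAR%d: base=0x%" PRIx64 " size=%" PRIu64 "MB\n", bar, start, (end - start + 1) >> 20);
    }
    fclose(f);
    return 0;
}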


