[QNX] mmap+cache/nocache+memcpy/asm_memcpy速度对比

ops/2024/9/23 19:17:02/

mmap nocache介绍

以linux系统的nocache介绍:

在Linux系统中,使用mmap映射文件到内存时可以指定不使用缓存。这可以通过在调用mmap时将MAP_NOCACHE标志传递给mmap函数来实现。(注意:MAP_NOCACHE并不在Linux mmap(2)手册页列出的标志之中,它是macOS/BSD系的标志;在Linux上编译下面的示例前,请先确认目标平台的头文件是否定义了该标志。)
MAP_NOCACHE标志告诉内核不要将映射的内存页缓存到文件系统缓存中,而是直接将内存与文件关联。这对于需要频繁读写大量数据的应用程序是有益的,因为它避免了在读写数据时额外的缓存开销。

例如:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
/*
 * Example: map the first 100 bytes of file.txt read/write with the
 * MAP_NOCACHE flag, then release the mapping and the descriptor.
 *
 * Returns 0 on success and 1 if the file cannot be opened or mapped.
 * NOTE(review): MAP_NOCACHE is platform-specific; confirm the target's
 * <sys/mman.h> defines it before building this example.
 */
int main(void)
{
    int fd = open("file.txt", O_RDWR);
    if (fd < 0) {
        return 1;
    }

    char *ptr = mmap(NULL, 100, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_NOCACHE, fd, 0);
    if (ptr == MAP_FAILED) {
        /* Do not leak the descriptor when the mapping fails. */
        close(fd);
        return 1;
    }

    /* use ptr to read and write data directly from/to the file ... */

    munmap(ptr, 100);
    close(fd);
    return 0;
}

qnx系统介绍:
https://www.qnx.com/developers/docs/7.1/#com.qnx.doc.neutrino.lib_ref/topic/p/posix_typed_mem_open.html

数据

memcpy cached speed: 2133.333333 MB/s (千兆级别)
memcpy nocached speed: 116.363636 MB/s (百兆级别)
invalidate memcpy cached speed: 1333.333333 MB/s (千兆级别)
invalidate memcpy nocached speed: 112.280702 MB/s (百兆级别)
asm memcpy cached speed: 2133.333333 MB/s (千兆级别)
asm memcpy nocached speed: 225.352113 MB/s (百兆级别)

名词

mmap cached指mmap使用PROT_READ | PROT_WRITE这个flag

mmap nocached是指mmap使用PROT_READ | PROT_WRITE | PROT_NOCACHE这个flag

for (size_t i = 0; i < count; i++) {memcpy(dst, src[i], bytes);
}
  • invalidate memcpy是指每次memcpy前先调用msync(..., MS_INVALIDATE)使该段映射对应的缓存数据失效,如
for (size_t i = 0; i < count; i++) {msync(src[i], bytes, MS_INVALIDATE);memcpy(dst, src[i], bytes);
}
/*
 * Copy `size` bytes from `src` to `dst` (regions must not overlap).
 *
 * On AArch64 builds with _QNX_ defined, whole 128-byte chunks are moved with
 * four LDP/STP pairs of 128-bit Q registers per iteration; any remaining
 * bytes (size % 128) are copied with memcpy, and a `dsb sy` barrier is issued
 * at the end. On every other target the function is a plain memcpy, so the
 * destination is always filled. Requires <string.h> for memcpy.
 *
 * `static inline` is used because a plain C99/C11 `inline` definition does
 * not emit an out-of-line symbol, which fails to link when the call is not
 * inlined.
 */
static inline void aarch64_fast_memcpy(void *dst, const void *src, size_t size)
{
#if defined(_QNX_) && defined(__aarch64__)
    const unsigned char *s = (const unsigned char *)src;
    unsigned char *d = (unsigned char *)dst;
    /* The loop only terminates when the counter reaches exactly zero, so
     * feed it a non-zero multiple of 0x80 and handle the rest separately. */
    size_t bulk = size & ~(size_t)0x7F;
    size_t tail = size - bulk;

    if (bulk != 0) {
        __asm__ __volatile__(
            /* Numeric local label: stays unique when inlined repeatedly. */
            "1:\n"
            "ldp q3, q4,  [%0, #0x00]\n"
            "ldp q5, q6,  [%0, #0x20]\n"
            "ldp q7, q8,  [%0, #0x40]\n"
            "ldp q9, q10, [%0, #0x60]\n"
            "stp q3, q4,  [%1, #0x00]\n"
            "stp q5, q6,  [%1, #0x20]\n"
            "stp q7, q8,  [%1, #0x40]\n"
            "stp q9, q10, [%1, #0x60]\n"
            "add %0, %0, #0x80\n"
            "add %1, %1, #0x80\n"
            "subs %2, %2, #0x80\n"
            "b.ne 1b\n"
            /* All three operands are modified, so they are read-write. */
            : "+r"(s), "+r"(d), "+r"(bulk)
            :
            /* v8-v10 overlap callee-saved d8-d10; declaring them lets the
             * compiler preserve them. "cc" covers SUBS, "memory" the copy. */
            : "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "cc", "memory");
    }

    if (tail != 0) {
        memcpy(d, s, tail);
    }

    __asm__ __volatile__("dsb sy" : : : "memory");
#else
    memcpy(dst, src, size);
#endif
}
// ......   
for (size_t i = 0; i < count; i++) {aarch64_fast_memcpy(dst, src[i], bytes);
}

(完整代码见后面长源码文件)

结论

cached情况下:memcpy和asm memcpy差不多快(没有明显区别),invalidate memcpy最慢。

nocached情况下,asm memcpy最快,memcpy第二快,invalidate memcpy最慢。

测试源码

// mmap_memcpy.c
#include <errno.h>
#include <fcntl.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>#ifndef _QNX_
#define PROT_NOCACHE 0
#endifinline void aarch64_fast_memcpy(void *dst, const void *src, size_t size) {
#ifdef _QNX_void *ss = (void *)src, *dd = (void *)dst;size_t sz = size;asm volatile("loop_start: ""ldp q3, q4,[%0,#0x0]\n""ldp q5, q6,  [%0,#0x20]\n""ldp q7, q8,  [%0,#0x40]\n""ldp q9, q10, [%0,#0x60]\n""stp q3, q4,  [%1,#0x0]\n""stp q5, q6,  [%1,#0x20]\n""stp q7, q8, [%1,#0x40]\n""stp q9, q10, [%1,#0x60]\n""add %0, %0, #0x80\n""add %1, %1, #0x80\n""subs %2, %2, #0x80\n""b.ne loop_start\n""dsb sy\n": /* no output */: "r"(ss), "r"(dd), "r"(sz));
#endif
}off_t offset(unsigned int bytes) {static off_t base_offset = 0x1E000000;off_t return_base_offset = base_offset;base_offset += bytes;return return_base_offset;
}void *mmap_memory(unsigned int bytes, int flag) {int fd = open("/dev/mem", O_RDWR | O_SYNC);if (fd < 0) {printf("open /dev/mem failed: %s\n", strerror(errno));}void *ptr = mmap(NULL, bytes, flag, MAP_SHARED, fd, offset(bytes));close(fd);if (MAP_FAILED == ptr) {printf("mmap failed: %s\n", strerror(errno));}return ptr;
}void *mmap_memory_cached(unsigned int bytes) {return mmap_memory(bytes, PROT_READ | PROT_WRITE);
}void *mmap_memory_nocached(unsigned int bytes) {return mmap_memory(bytes, PROT_READ | PROT_WRITE | PROT_NOCACHE);
}// if C++
#ifdef __cplusplus
#include <chrono>
using namespace std::chrono;
#define start() auto start_ = high_resolution_clock::now();
#define end()                                                                  \auto end_ = high_resolution_clock::now();                                  \double bytes_mb = bytes * count / 1024.0 / 1024.0;                         \double cost_ns = duration_cast<nanoseconds>(end_ - start_).count();        \double mps = bytes_mb / cost_ns * 1e9;#else
#include <time.h>
#include <unistd.h>
double time_diff_ns(struct timespec start, struct timespec end) {return (end.tv_sec - start.tv_sec) * 1e9 + (end.tv_nsec - start.tv_nsec);
}
#define start()                                                                \struct timespec start_, end_;                                              \clock_gettime(CLOCK_REALTIME, &start_);#define end()                                                                  \clock_gettime(CLOCK_REALTIME, &end_);                                      \double bytes_mb = bytes * count / 1024.0 / 1024.0;                         \double cost_ns = time_diff_ns(start_, end_);                               \double mps = bytes_mb / cost_ns * 1e9;#endifdouble memcpy_speed(void *dst, void **src, unsigned int bytes,unsigned int count) {start();for (size_t i = 0; i < count; i++) {memcpy(dst, src[i], bytes);}end();return mps;
}double invalidate_memcpy_speed(void *dst, void **src, unsigned int bytes,unsigned int count) {start();for (size_t i = 0; i < count; i++) {msync(src[i], bytes, MS_INVALIDATE);memcpy(dst, src[i], bytes);}end();return mps;
}double asm_memcpy_speed(void *dst, void **src, unsigned int bytes,unsigned int count) {start();for (size_t i = 0; i < count; i++) {aarch64_fast_memcpy(dst, src[i], bytes);}end();return mps;
}int main(int argc, char *argv[]) {const unsigned int count = 2;unsigned int bytes = 128 * 1024; // 128 KBif (argc > 1) {bytes = atoi(argv[1]) * 1024;}printf("bytes: %d\n", bytes);printf("count: %d\n", count);void *mmap_cached_src[count];void *mmap_nocached_src[count];for (size_t i = 0; i < count; i++) {mmap_cached_src[i] = mmap_memory_cached(bytes);mmap_nocached_src[i] = mmap_memory_nocached(bytes);}void *dst = malloc(bytes);printf("memcpy cached speed: %f MB/s\n",memcpy_speed(dst, mmap_cached_src, bytes, count));printf("memcpy nocached speed: %f MB/s\n",memcpy_speed(dst, mmap_nocached_src, bytes, count));printf("invalidate memcpy cached speed: %f MB/s\n",invalidate_memcpy_speed(dst, mmap_cached_src, bytes, count));printf("invalidate memcpy nocached speed: %f MB/s\n",invalidate_memcpy_speed(dst, mmap_nocached_src, bytes, count));printf("asm memcpy cached speed: %f MB/s dst[0]=%d\n",asm_memcpy_speed(dst, mmap_cached_src, bytes, count), ((char *)dst)[0]);printf("asm memcpy nocached speed: %f MB/s\n",asm_memcpy_speed(dst, mmap_nocached_src, bytes, count));free(dst);for (size_t i = 0; i < count; i++) {munmap(mmap_cached_src[i], bytes);munmap(mmap_nocached_src[i], bytes);}return 0;
}

http://www.ppmy.cn/ops/33601.html

相关文章

ffmpeg 转文件为h265

查看 nvidia-smi ---------------------------------------------------------------------------------------- | NVIDIA-SMI 551.61 Driver Version: 551.61 CUDA Version: 12.4 | |-------------------------------------------------------------------------------------…

微信小程序个人开放服务类目表

微信小程序个人开放服务类目表 服务类目类目分类一类目分类二引导描述出行与交通代驾//生活服务家政、丽人、摄影/扩印、婚庆服务、环保回收/废物回收//餐饮点评与推荐、菜谱、餐厅排队//旅游出境WiFi、旅游攻略//商业服务会展服务、律师/ 【律师】类目需上传《律师执业资格证…

文件上传结合springboot

目录 前台页面 后台接口 文件上传总结 前端页面三要素 服务端接收文件 前台页面 <form action="/upload" method="post" enctype="multipart/form-data"> 姓名<input type="text" name="username"><br> 年…

对命令模式的理解

目录 一、场景1、文本编辑器并不是一个好的例子,设备控制器才是2、设备控制器的demo 二、不用命令模式1、代码2、问题 三、使用命令模式1、代码2、当需求变化时2.1 新增代码2.2 优点 四、进一步思考1、省略对Command的建模可以吗?2、命令模式的价值 一、…

[C语言]指针进阶详解

指针是C语言的精髓所以内容可能会比较多,需要我们认真学习 目录 1、字符指针 2、指针数组 3、数组指针 3.1数组指针的定义 3.2&数组名vs数组名 3.3数组指针的使用 4、数组传参和指针传参 4.1一维数组传参 4.2二维数组传参 4.3一级指针传参 4.4二级指…

STM32入门_江协科技_3~4_OB记录的自学笔记_软件安装新建工程

3. 软件安装 3.1. 安装Keil5 MDK 作者的资料下载的链接如下:https://jiangxiekeji.com/download.html#32 3.2. 安装器件支持包 因为新的芯片层出不穷,所以需要安装Keil5提供的器件升级版对软件进行升级,从而支持新的芯片;如果不…

毕设基于深度学习的斑马鱼卵识别系统笔记和遇到的问题

我是在云GPU上训练的,训练完成之后,保存了最优模型字典pth文件,在到cpu本地加载的时候,我们需要读取pth文件,正确代码如下 # 加载模型权重,确保映射到正确的设备 model = models.resnet18(weights=models.ResN…

阿里云开源大模型开发环境搭建

ModelScope是阿里云通义千问开源的大模型开发者社区,本文主要描述AI大模型开发环境的搭建。 如上所示,安装ModelScope大模型基础库开发框架的命令行参数,使用清华大学提供的镜像地址 如上所示,在JetBrains PyCharm的项目工程终端控…