《Cuda By Exmaple》文中的Texture Reference方法,CUDA 12已经不再支持了。为了试试Texture内存模式的加速功能,补充利用Texture Object API的例子。
Texture Object API
Texture Object API里,仍然有tex1Dfetch和tex2D两个函数,用于获取Texture里数据。tex1Dfetch和tex2D在新版本中,定义为两个模板函数。
template<class T>
T tex1Dfetch(cudaTextureObject_t texObj, int x);template<class T>
T tex2D(cudaTextureObject_t texObj, float x, float y);
模板参数T为Texture里的数据类型,texObj指代的Texture对象,可以存储各种类型的数据,比如常用的浮点数类型float。tex1Dfetch的第二个参数 int x为数据在Texture中的坐标。由坐标类型为整数可知,tex1Dfetch只支持非归一化的坐标值。tex2D用于从二维Texture对象中取值,x, y分别为在x轴和y轴的坐标值。float类型允许使用归一化后的坐标值。要从一个存储数据类型为float的Texture对象texObj中取出 x 处的值,可以用下面的语句:
float v = tex2D<float>(texObj, x);
因此,前文中的copy_const_kernel和blend_kernel可以改写成:
__global__ void copy_const_kernel(float* iptr, cudaTextureObject_t texConst) {// map threadIdx/blockIdx to x, yint x = threadIdx.x + blockIdx.x * blockDim.x;int y = threadIdx.y + blockIdx.y * blockDim.y;int offset = x + y * blockDim.x * gridDim.x;float v = tex1Dfetch<float>(texConst, offset);if (v != 0)iptr[offset] = v;
}
除了引入了参数cudaTextureObject_t texConst作为热源输入外,跟前文几乎一样。
__global__ void blend_kernel(float* outSrc, cudaTextureObject_t texIn)
{// map threadIdx/blockIdx to x, yint x = threadIdx.x + blockIdx.x * blockDim.x;int y = threadIdx.y + blockIdx.y * blockDim.y;int offset = x + y * blockDim.x * gridDim.x;int left = offset - 1;int right = offset + 1;if (x == 0)left = left + 1;if (x == DIM - 1)right = right - 1;int top = offset - DIM;int bottom = offset + DIM;if (y == 0)top += DIM;if (y == DIM - 1)bottom -= DIM;outSrc[offset] = tex1Dfetch<float>(texIn, offset) + SPEED * (tex1Dfetch<float>(texIn, top) + tex1Dfetch<float>(texIn, bottom)+ tex1Dfetch<float>(texIn, left) + tex1Dfetch<float>(texIn, right) - 4 * tex1Dfetch<float>(texIn, offset));
}
同样,因为tex1Dfetch不是像之前那样,在编译时就跟Texture Reference静态捆绑在一起,我们可以把输入Texture作为函数参数传进来。
Texture 对象
为了跟前文做直观对比,tex1Dfetch 和 tex2D放在前面讲了。同时可能也引发了我们的好奇心,Texture对象是怎么得来的。
Texture对象在CUDA中的类型名是:cudaTextureObject_t。创建一个Texture对象,我们先声明一个cudaTextureObject_t变量,然后使用cudaCreateTextureObject对变量进行初始化。
先来看一下cudaCreateTextureObject的定义:
cudaError_t cudaCreateTextureObject(cudaTextureObject_t* pTexObject,const cudaResourceDesc* pResDesc,const cudaTextureDesc* pTexDesc,const cudaResourceViewDesc* pResViewDesc
);
pTexObject
:返回的CUDA纹理对象。pResDesc
:纹理资源描述符,描述了纹理在内存中的位置和格式。pTexDesc
:纹理描述符,描述了纹理的滤波和寻址模式等属性。pResViewDesc
:资源视图描述符,描述了纹理的视图。
因此要创建一个Texture对象,我们必须提供三方面的信息。纹理视图跟我们的例子无关,可以先不管。剩下的cudaResourceDesc结构描述的是Texture对象存放数据的内存信息,cudaTextureDesc描述了访问内存时的一些规定。
cudaResourceDesc
cudaResourceDesc包含了一个枚举类型enum cudaResourceType,用于描述资源的类型。当前CUDA支持4中资源,分别是:Array,Mipmapped Array,Linear 和 Pitch 2D。cudaResourceDesc的其余部分是一个联合体,对应每一种资源类型,定义了描述该类资源结构。本例中我们使用Linear类型资源,即一维数组。需要提供的信息包含在下面的结构中。
struct {void *devPtr; /**< Device pointer */struct cudaChannelFormatDesc desc; /**< Channel descriptor */size_t sizeInBytes; /**< Size in bytes */} linear;
devPtr为跟Texture关联的内存的指针,通过cudaMalloc分配得到。sizeInBytes指定以byte为单位,内存的长度,我们的内存大小为显示网格的图像大小。desc描述内存数据包含的通道数量及类型,我们使用单通道的float类型。
我们使用下面的代码初始化热源的cudaResourceDesc:
cudaMalloc((void**)&d.dev_constSrc, bitmap.image_size());
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = d.dev_constSrc;
resDesc.res.linear.desc = channelDesc;
resDesc.res.linear.sizeInBytes = bitmap.image_size();
cudaTextureDesc
cudaTextureDesc包含了很多变量,addressMode指定坐标超出边界时,如何处理坐标,我们自己维护了坐标值确保了它不会出界。 不过在2D Texture的示例中,我们需要把它设置为cudaAddressModeClamp。 readMode 和 normalizedCoords 也需要指定。 我们需要读取的数组元素,readMode 设置为cudaReadModeElementType。坐标为原始坐标,不需要归一化,设置为0.
由此我们使用下面的代码初始化热源的cudaTextureDesc:
struct cudaTextureDesc texDesc;memset(&texDesc, 0, sizeof(texDesc));texDesc.addressMode[0] = cudaAddressModeClamp;texDesc.addressMode[1] = cudaAddressModeClamp;//texDesc.filterMode = cudaFilterModePoint;texDesc.readMode = cudaReadModeElementType;//texDesc.normalizedCoords = 0;
最后创建Texture对象,我们的Texture对象声明在DataBlock里,所以有以下的代码:
cudaCreateTextureObject(&d.texConstSrc, &resDesc, &texDesc, NULL);
完整代码
In 和 Out 的Texture对象的声明跟热源类型,只是用到不同内存,其他都一样。下面是完整的代码,在CUDA 12 编译通过。遗憾的是性能还不如最早使用Global Momory的版本。
#include "book.h"
#include "cpu_anim.h"#define SPEED 0.25f
const int DIM = 1024;const float MAX_TEMP = 1.0f;
const float MIN_TEMP = 0.0001f;__global__ void copy_const_kernel(float* iptr, cudaTextureObject_t texConst) {// map threadIdx/blockIdx to x, yint x = threadIdx.x + blockIdx.x * blockDim.x;int y = threadIdx.y + blockIdx.y * blockDim.y;int offset = x + y * blockDim.x * gridDim.x;float v = tex1Dfetch<float>(texConst, offset);if (v != 0)iptr[offset] = v;
}__global__ void blend_kernel(float* outSrc, cudaTextureObject_t texIn)
{// map threadIdx/blockIdx to x, yint x = threadIdx.x + blockIdx.x * blockDim.x;int y = threadIdx.y + blockIdx.y * blockDim.y;int offset = x + y * blockDim.x * gridDim.x;int left = offset - 1;int right = offset + 1;if (x == 0)left = left + 1;if (x == DIM - 1)right = right - 1;int top = offset - DIM;int bottom = offset + DIM;if (y == 0)top += DIM;if (y == DIM - 1)bottom -= DIM;outSrc[offset] = tex1Dfetch<float>(texIn, offset) + SPEED * (tex1Dfetch<float>(texIn, top) + tex1Dfetch<float>(texIn, bottom)+ tex1Dfetch<float>(texIn, left) + tex1Dfetch<float>(texIn, right) - 4 * tex1Dfetch<float>(texIn, offset));
}struct DataBlock {unsigned char* output_bitmap;float * dev_inSrc;float * dev_outSrc;float * dev_constSrc;cudaTextureObject_t texConstSrc;cudaTextureObject_t texIn;cudaTextureObject_t texOut;CPUAnimBitmap* bitmap;cudaEvent_t start, stop;float totalTime;float frames;
};void anim_gpu(DataBlock* d, int ticks) {cudaEventRecord(d->start, 0);dim3 blocks(DIM / 16, DIM / 16);dim3 threads(16, 16);// tranfer 90 times for a good displayvolatile bool dstOut = true;for (int i = 0; i < 90; i++) {if (dstOut) {copy_const_kernel <<<blocks, threads>>> (d->dev_inSrc, d->texConstSrc);blend_kernel <<<blocks, threads>>> (d->dev_outSrc, d->texIn);}else {copy_const_kernel <<<blocks, threads>>> (d->dev_outSrc, d->texConstSrc);blend_kernel <<<blocks, threads>>> (d->dev_inSrc, d->texOut);}dstOut = !dstOut;}float_to_color << <blocks, threads >> > (d->output_bitmap, d->dev_outSrc);//Copy image from devicecudaMemcpy(d->bitmap->get_ptr(), d->output_bitmap, d->bitmap->image_size(), cudaMemcpyDeviceToHost);cudaEventRecord(d->stop, 0);cudaEventSynchronize(d->stop);float elapsedTime;cudaEventElapsedTime(&elapsedTime, d->start, d->stop);d->totalTime += elapsedTime;++d->frames;printf("Average Time per frame: %3.1f ms\n", d->totalTime / d->frames);
}void anim_exit(DataBlock* d) {cudaDestroyTextureObject(d->texConstSrc);cudaDestroyTextureObject(d->texIn);cudaDestroyTextureObject(d->texOut);cudaFree(d->dev_constSrc);cudaFree(d->dev_inSrc);cudaFree(d->dev_outSrc);cudaEventDestroy(d->start);cudaEventDestroy(d->stop);
}int main(int argc, char* argv[]) {DataBlock d;CPUAnimBitmap bitmap(DIM, DIM, &d);d.bitmap = &bitmap;d.frames = 0;d.totalTime = 0.0f;cudaEventCreate(&d.start);cudaEventCreate(&d.stop);HANDLE_ERROR(cudaMalloc((void**)&d.output_bitmap, bitmap.image_size()));//Allocate CUDA Array in device memory.cudaMalloc((void**)&d.dev_constSrc, bitmap.image_size());cudaMalloc((void**)&d.dev_inSrc, bitmap.image_size());cudaMalloc((void**)&d.dev_outSrc, bitmap.image_size());// Create Texture obj for const,in and outcudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();struct cudaResourceDesc resDesc;memset(&resDesc, 0, sizeof(resDesc));resDesc.resType = cudaResourceTypeLinear;resDesc.res.linear.devPtr = d.dev_constSrc;resDesc.res.linear.desc = channelDesc;resDesc.res.linear.sizeInBytes = bitmap.image_size();struct cudaTextureDesc texDesc;memset(&texDesc, 0, sizeof(texDesc));texDesc.addressMode[0] = cudaAddressModeClamp;texDesc.addressMode[1] = cudaAddressModeClamp;//texDesc.filterMode = cudaFilterModePoint;texDesc.readMode = cudaReadModeElementType;//texDesc.normalizedCoords = 0;cudaCreateTextureObject(&d.texConstSrc, &resDesc, &texDesc, NULL);resDesc.res.linear.devPtr = d.dev_inSrc;cudaCreateTextureObject(&d.texIn, &resDesc, &texDesc, NULL);resDesc.res.linear.devPtr = d.dev_outSrc;cudaCreateTextureObject(&d.texOut, &resDesc, &texDesc, NULL);//create constant heaterfloat* temp = (float*)malloc(bitmap.image_size());for (int i = 0; i < DIM * DIM; i++) {temp[i] = 0;int x = i % DIM;int y = i / DIM;if ((x > 300) && (x < 600) && (y > 310) && (y < 601))temp[i] = MAX_TEMP;}temp[DIM * 100 + 100] = (MAX_TEMP + MIN_TEMP) / 2.0;temp[DIM * 700 + 100] = MIN_TEMP;temp[DIM * 300 + 300] = MIN_TEMP;temp[DIM * 200 + 700] = MIN_TEMP;for (int y = 800; y < 900; y++) {for (int x = 400; x < 500; x++) {temp[x + y * DIM] = MIN_TEMP;}}// Copy data located at address h_data in host memory to device memorycudaMemcpy(d.dev_constSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice);for (int y = 800; y < DIM; y++) {for (int x = 0; x < 200; x++) {temp[x + y * DIM] = MAX_TEMP;}}cudaMemcpy(d.dev_inSrc, temp, bitmap.image_size(), cudaMemcpyHostToDevice);free(temp);bitmap.anim_and_exit((void(*)(void*, int))anim_gpu, (void(*)(void*))anim_exit);cudaFree(d.output_bitmap);
}