ubuntu 22.04 cuda12.x 运行 cutensor 1.6.2 sample
1.6.2 是比较旧的 cuTENSOR 版本,但 NVIDIA 仍为它提供了面向新 CUDA 平台的构建,因此可以在 CUDA 12 上使用 cuTENSOR 1.6.2。
1,下载libcutensor 1.6.2
下载 cutensor 1.6.2 for all Linux and all cuda:
https://developer.nvidia.com/cutensor/1.6.2/downloads
wget https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz
tar xf libcutensor-linux-x86_64-1.6.2.3-archive.tar.xz
ls libcutensor-linux-x86_64-1.6.2.3-archive/lib/
10.2/ 11/ 11.0/ 12/
2,运行示例
由于 cuTENSOR 2.x 对 API 做了不兼容的改动,例如 cutensorInit(&handle) 已被 cutensorCreate(&handle) 取代;
因此需要使用旧版 CUDALibrarySamples 中(对应 cuTENSOR 1.x)的示例代码,例如:
Makefile:
# Root of the unpacked cuTENSOR 1.6.2 redistributable.
# NOTE(review): the tarball presumably unpacks to a directory ending in
# "-archive"; adjust this path if the directory was not renamed.
CUTENSOR_ROOT := /home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3

# lib/12 selects the libraries from the "12" sub-directory of the package.
CXX_FLAGS=-std=c++11 -I${CUTENSOR_ROOT}/include -L${CUTENSOR_ROOT}/lib/12 -lcutensor -lcudart

.PHONY: all clean

# Build every sample with nvcc.
all:
	nvcc einsum.cu -o einsum ${CXX_FLAGS}
	nvcc contraction.cu -o contraction ${CXX_FLAGS}
	nvcc contraction_simple.cu -o contraction_simple ${CXX_FLAGS}
	nvcc contraction_autotuning.cu -o contraction_autotuning ${CXX_FLAGS}
	nvcc elementwise_binary.cu -o elementwise_binary ${CXX_FLAGS}
	nvcc elementwise_permute.cu -o elementwise_permute ${CXX_FLAGS}
	nvcc elementwise_trinary.cu -o elementwise_trinary ${CXX_FLAGS}
	nvcc reduction.cu -o reduction ${CXX_FLAGS}

# Remove every binary produced by `all` (including einsum).
clean:
	rm -f einsum contraction contraction_simple contraction_autotuning elementwise_binary elementwise_permute elementwise_trinary reduction
contraction_simple.cu
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>

#include <cuda_runtime.h>
#include <cutensor.h>

/*
 * Evaluates a cuTENSOR call once; if it did not return CUTENSOR_STATUS_SUCCESS,
 * prints the cuTENSOR error string and returns the status from the enclosing
 * function. Wrapped in do { } while (0) so that `HANDLE_ERROR(...);` behaves
 * as a single statement, including inside an unbraced if/else.
 */
#define HANDLE_ERROR(x)                                                 \
    do {                                                                \
        const auto err = (x);                                           \
        if (err != CUTENSOR_STATUS_SUCCESS)                             \
        {                                                               \
            printf("Error: %s\n", cutensorGetErrorString(err));         \
            return err;                                                 \
        }                                                               \
    } while (0)

/*
 * Same as HANDLE_ERROR, but for CUDA runtime calls: compares against
 * cudaSuccess and reports through cudaGetErrorString.
 */
#define HANDLE_CUDA_ERROR(x)                                            \
    do {                                                                \
        const auto err = (x);                                           \
        if (err != cudaSuccess)                                         \
        {                                                               \
            printf("Error: %s\n", cudaGetErrorString(err));             \
            return err;                                                 \
        }                                                               \
    } while (0)

/* This routine computes the tensor contraction \f[ D = alpha * A * B + beta * C \f] using the staged-API */
cutensorStatus_t cutensorContractionSimple(const cutensorHandle_t* handle,const void* alpha, const void *A, const cutensorTensorDescriptor_t* descA, const int32_t modeA[],const void *B, const cutensorTensorDescriptor_t* descB, const int32_t modeB[],const void* beta, const void *C, const cutensorTensorDescriptor_t* descC, const int32_t modeC[],void *D, const cutensorTensorDescriptor_t* descD, const int32_t modeD[],cutensorComputeType_t typeCompute, cutensorAlgo_t algo, cutensorWorksizePreference_t workPref,cudaStream_t stream)
{/*********************************************** Retrieve the memory alignment for each tensor**********************************************/ uint32_t alignmentRequirementA;HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,A, descA, &alignmentRequirementA));uint32_t alignmentRequirementB;HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,B, descB, &alignmentRequirementB));uint32_t alignmentRequirementC;HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,C, descC, &alignmentRequirementC));uint32_t alignmentRequirementD;HANDLE_ERROR(cutensorGetAlignmentRequirement(handle,D, descD, &alignmentRequirementD));/******************************** Create Contraction Descriptor*******************************/cutensorContractionDescriptor_t desc;HANDLE_ERROR(cutensorInitContractionDescriptor(handle, &desc,descA, modeA, alignmentRequirementA,descB, modeB, alignmentRequirementB,descC, modeC, alignmentRequirementC,descD, modeD, alignmentRequirementD,typeCompute));/*************************** Set the algorithm to use***************************/cutensorContractionFind_t find;HANDLE_ERROR(cutensorInitContractionFind( handle, &find, algo));/*********************** Query workspace**********************/size_t worksize = 0;HANDLE_ERROR(cutensorContractionGetWorkspaceSize(handle,&desc,&find,workPref, &worksize));void *work = nullptr;if (worksize > 0){if(cudaSuccess != cudaMalloc(&work, worksize)){work = nullptr;worksize = 0;}} /*************************** Create Contraction Plan**************************/cutensorContractionPlan_t plan;HANDLE_ERROR(cutensorInitContractionPlan(handle,&plan,&desc,&find,worksize));/*********************** Run**********************/HANDLE_ERROR(cutensorContraction(handle,&plan,(void*) &alpha, A, B,(void*) &beta, C, D, work, worksize, stream));return CUTENSOR_STATUS_SUCCESS;
}int main()
{typedef float floatTypeA;typedef float floatTypeB;typedef float floatTypeC;typedef float floatTypeCompute;cudaDataType_t typeA = CUDA_R_32F;cudaDataType_t typeB = CUDA_R_32F;cudaDataType_t typeC = CUDA_R_32F;cutensorComputeType_t typeCompute = CUTENSOR_COMPUTE_32F;floatTypeCompute alpha = (floatTypeCompute) 1.1f;floatTypeCompute beta = (floatTypeCompute) 0.f;/*********************** Computing: C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v}**********************/std::vector<int> modeC{'m','u','n','v'};std::vector<int> modeA{'m','h','k','n'};std::vector<int> modeB{'u','k','v','h'};int nmodeA = modeA.size();int nmodeB = modeB.size();int nmodeC = modeC.size();std::unordered_map<int, int64_t> extent;extent['m'] = 96;extent['n'] = 96;extent['u'] = 96;extent['v'] = 64;extent['h'] = 64;extent['k'] = 64;double gflops = (2.0 * extent['m'] * extent['n'] * extent['u'] * extent['v'] * extent['k'] * extent['h']) /1e9;std::vector<int64_t> extentC;for (auto mode : modeC)extentC.push_back(extent[mode]);std::vector<int64_t> extentA;for (auto mode : modeA)extentA.push_back(extent[mode]);std::vector<int64_t> extentB;for (auto mode : modeB)extentB.push_back(extent[mode]);/*********************** Allocating data**********************/size_t elementsA = 1;for (auto mode : modeA)elementsA *= extent[mode];size_t elementsB = 1;for (auto mode : modeB)elementsB *= extent[mode];size_t elementsC = 1;for (auto mode : modeC)elementsC *= extent[mode];size_t sizeA = sizeof(floatTypeA) * elementsA;size_t sizeB = sizeof(floatTypeB) * elementsB;size_t sizeC = sizeof(floatTypeC) * elementsC;printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);void *A_d, *B_d, *C_d;HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * 
elementsB);floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);if (A == NULL || B == NULL || C == NULL){printf("Error: Host allocation of A, B, or C.\n");return -1;}/******************** Initialize data*******************/for (int64_t i = 0; i < elementsA; i++)A[i] = (((float) rand())/RAND_MAX - 0.5)*100;for (int64_t i = 0; i < elementsB; i++)B[i] = (((float) rand())/RAND_MAX - 0.5)*100;for (int64_t i = 0; i < elementsC; i++)C[i] = (((float) rand())/RAND_MAX - 0.5)*100;HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));/************************** cuTENSOR*************************/ cutensorHandle_t handle;HANDLE_ERROR(cutensorInit(&handle));/*********************** Create Tensor Descriptors**********************/cutensorTensorDescriptor_t descA;HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,&descA,nmodeA,extentA.data(),NULL /* stride */,typeA, CUTENSOR_OP_IDENTITY));cutensorTensorDescriptor_t descB;HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,&descB,nmodeB,extentB.data(),NULL /* stride */,typeB, CUTENSOR_OP_IDENTITY));cutensorTensorDescriptor_t descC;HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,&descC,nmodeC,extentC.data(),NULL /* stride */,typeC, CUTENSOR_OP_IDENTITY));HANDLE_ERROR(cutensorContractionSimple(&handle,(void*)&alpha, A_d, &descA, modeA.data(),B_d, &descB, modeB.data(),(void*)&beta, C_d, &descC, modeC.data(),C_d, &descC, modeC.data(),typeCompute, CUTENSOR_ALGO_DEFAULT,CUTENSOR_WORKSPACE_RECOMMENDED, 0 /* stream */));return 0;
}
运行:
export LD_LIBRARY_PATH=/home/hipper/cutensor_ex/libcutensor-linux-x86_64-1.6.2.3/lib/12