Memory issue with texture in mexCUDA compiled code

3 views (last 30 days)
I created a helper function mxArrayToTexture_3D_float4() to facilitate the process of creating CUDA objects from mxArray.
This function will check the type and size of inputMxArray, hide the details of creating cuArray, texture resource, description, etc. It works great when I have a dozen of texture objects to create with different dimension, size and type. However, I am not sure how to clean up and free the memory afterwards.
In the attached code, I can pass a 4 x 400x 400 x400 matlab array (~1G in memory size) and bind it to a texture object in CUDA with 1) explicit code, 2) through the helper function.
With the first approach, the device memory allocated to the texture is freed with cudaDestroyTextureObject() and cudaFreeArray()
However, with the second approach, those memories are not freed after the code exits. Everytime, I ran the code it will occupy ~1Gb device memory and eventually causing out of memory erros.
It is probably due to how I wrote those C codes and pass the texture object and cuda array through the helpfer function. Anyway, any suggestion to fix this issue is appreciated !
#include "tmwtypes.h"
#include "mex.h"
#include "gpu/mxGPUArray.h"
/* Convert matlab float array to CUDA 3D texture*/
void mxArrayToTexture_3D_float4(
const mxArray* inputArray,
cudaTextureObject_t& outputTexture,
cudaArray *cuArray){
unsigned int height = mxGetDimensions(inputArray)[1];
unsigned int width = mxGetDimensions(inputArray)[2];
unsigned int depth = mxGetDimensions(inputArray)[3];
cudaExtent extent = make_cudaExtent(height, width, depth);
float4 const* h_ptr = (float4 const*)mxGetPr(inputArray);
// cuArray and channel description
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
cudaMalloc3DArray(&cuArray, &channelDesc, extent);
// cudaMemcpy
cudaMemcpy3DParms copyParams = { 0 };
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_ptr, extent.width*sizeof(float4), extent.width, extent.height);
copyParams.dstArray = cuArray;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams);
// CUDA resource description
cudaResourceDesc texRes;
memset(&texRes,0,sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray;
// CUDA texture description
cudaTextureDesc texDescr;
memset(&texDescr,0,sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.addressMode[2] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&outputTexture, &texRes, &texDescr, NULL);
}
/**
* MEX gateway
*/
void mexFunction(int /* nlhs */, mxArray *plhs[],
int nrhs, mxArray const *prhs[])
{
mxInitGPU();
/* 1. Explicit approach, no issue with GPU memory after code finishs*/
mxArray const* inputArray = prhs[0];
unsigned int height = mxGetDimensions(inputArray)[1];
unsigned int width = mxGetDimensions(inputArray)[2];
unsigned int depth = mxGetDimensions(inputArray)[3];
cudaExtent extent = make_cudaExtent(height, width, depth);
cudaTextureObject_t outputTexture1;
cudaArray *cuArray1;
float4 const* h_ptr = (float4 const*)mxGetPr(inputArray);
// cuArray and channel description
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
cudaMalloc3DArray(&cuArray1, &channelDesc, extent);
// cudaMemcpy
cudaMemcpy3DParms copyParams = { 0 };
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_ptr, extent.width*sizeof(float4), extent.width, extent.height);
copyParams.dstArray = cuArray1;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams);
// CUDA resource description
cudaResourceDesc texRes;
memset(&texRes,0,sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray1;
// CUDA texture description
cudaTextureDesc texDescr;
memset(&texDescr,0,sizeof(cudaTextureDesc));
texDescr.normalizedCoords = false;
texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeClamp;
texDescr.addressMode[1] = cudaAddressModeClamp;
texDescr.addressMode[2] = cudaAddressModeClamp;
texDescr.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&outputTexture1, &texRes, &texDescr, NULL);
cudaError_t lastError;
lastError = cudaGetLastError();
mexPrintf("%s\n", cudaGetErrorString(lastError));
cudaDestroyTextureObject(outputTexture1);
cudaFreeArray(cuArray1);
lastError = cudaGetLastError();
mexPrintf("%s\n", cudaGetErrorString(lastError));
/* 2. Using helper function, Device memory not cleared after the code exits */
cudaTextureObject_t outputTexture2;
cudaArray *cuArray2;
mxArrayToTexture_3D_float4(prhs[0], outputTexture2, cuArray2);
lastError = cudaGetLastError();
mexPrintf("%s\n", cudaGetErrorString(lastError));
cudaDestroyTextureObject(outputTexture2);
cudaFreeArray(cuArray2);
lastError = cudaGetLastError();
mexPrintf("%s\n", cudaGetErrorString(lastError));
}

Answers (1)

Joss Knight
Joss Knight on 26 May 2023
Edited: Joss Knight on 26 May 2023
It looks like the syntax for your function mxArrayToTexture_3D_float4 is incorrect. You are passing the pointer cuArray by value, so you are overwriting it locally with the allocated address but not passing that back to the caller. Pass the pointer by reference instead.

Products


Release

R2021b

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!