CUDA 共享内存并行_在动态并行 CUDA 中使用共享内存
发布日期:2021-09-13 10:04:19 浏览次数:2 分类:技术文章

本文共 3769 字,大约阅读时间需要 12 分钟。

Question 1: 如果共享内存只在子内核中使用,那么在启动父内核时,是否还必须指定要分配的动态共享内存大小?

Question 2: 以下是我的子内核和父内核

父内核

/*
 * Parent kernel (CUDA dynamic parallelism).
 *
 * Every thread of the launching block starts one child grid of
 * ConvolutionColumn. The thread's x index scales ShiftAmount; the part of
 * that shift left over after rounding to the nearest integer is handed to
 * the child as its Delay. The thread's (x, y) indices also select the
 * offset into DeviceInput / LightFieldDevice that the child works on.
 *
 * Parameters:
 *   DataInput        - dimension descriptor (fields X, Y, U are read here).
 *   DeviceInput      - base of the input buffer.
 *   ShiftAmount      - shift per unit of threadIdx.x.
 *   LightFieldDevice - base of the output buffer.
 *
 * Launch requirements: the child uses dynamically sized shared memory, so
 * its size is passed in the child's own execution configuration below; the
 * parent launch does not need to reserve any for it.
 *
 * NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent
 * CUDA toolkits (see the dynamic-parallelism chapter of the programming
 * guide) - confirm against the toolkit version this file targets.
 */
__global__ void ColumnFractionalShift(DataIn DataInput,float* __restrict__ DeviceInput, float ShiftAmount, float* __restrict__ LightFieldDevice)
{
    cudaError_t status;

    // Split the per-thread shift into its nearest integer and the residual.
    float ImageShift = threadIdx.x*ShiftAmount;
    float ImageIntegerShift = nearbyintf(ImageShift);
    float Delay = ImageShift - ImageIntegerShift;

    // Offset selected by this thread's (x, y) position in the parent block.
    int InputImageOffset = DataInput.X*DataInput.Y*DataInput.U*(threadIdx.y) + DataInput.X*DataInput.Y*(threadIdx.x);

    dim3 dimBlock(32, 24);
    dim3 dimGrid(16, 14);

    // ConvolutionColumn stores three blockDim.x*blockDim.y float tiles
    // (upper, centre, lower) in its extern __shared__ array, so that many
    // bytes must be requested as the third launch argument.
    size_t SharedBytes = 3 * dimBlock.x * dimBlock.y * sizeof(float);

    ConvolutionColumn<<<dimGrid, dimBlock, SharedBytes>>>(DataInput, DeviceInput + InputImageOffset, Delay, LightFieldDevice + InputImageOffset);

    // Launch-configuration errors are reported here.
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("failed %s\n", cudaGetErrorString(status));
    }

    // Wait for the child grid; faults raised while it runs surface here, so
    // the return value must not be dropped.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        printf("failed %s\n", cudaGetErrorString(status));
        return; // the child's output cannot be trusted after a failure
    }

    if (threadIdx.x == 5)
    {
        printf("The values at beginig of %d %d are %f\n", threadIdx.x, threadIdx.y, *(LightFieldDevice + InputImageOffset));
    }
}

子内核

/*
 * Child kernel: per-pixel filtering along the row (y) direction.
 *
 * Each block stages three tiles of blockDim.x*blockDim.y floats in dynamic
 * shared memory - the rows above the block, the block's own rows, and the
 * rows below - then, for every filter k >= 1, accumulates
 *     Delay^k * sum_l DeviceFilterKernal[k][l] * pixel(row - l)
 * on top of the unfiltered centre pixel.
 *
 * Launch requirements:
 *   - dynamic shared memory: 3 * blockDim.x * blockDim.y * sizeof(float).
 *   - DeviceFilterSize and DeviceFilterKernal are defined outside this
 *     block (not visible here).
 *
 * Boundary handling visible in the code:
 *   - for blockIdx.y == 0 the upper tile is filled from the block's own
 *     rows in reversed order;
 *   - rows at or beyond DataInput.U are read reflected about row U-1;
 *   - results are written only for rows below DataInput.U.
 *
 * NOTE(review): there is no guard on the column index; this presumably
 * relies on gridDim.x*blockDim.x not exceeding the row length DataInput.V -
 * confirm against the caller. Likewise the taps reach at most one tile up
 * or down, so each filter's half-width must not exceed blockDim.y.
 */
__global__ void ConvolutionColumn(DataIn DataInput,float* __restrict__ DeviceInput, float Delay, float* __restrict__ DeviceResult)
{
    extern __shared__ float ConvolutionBlockLeft[];

    // Global offset of this block's first element, and this thread's slot
    // inside the shared tile / inside the block's region of the input.
    const int tileOrigin = blockDim.y*blockIdx.y*DataInput.V + blockIdx.x*blockDim.x;
    const int tileSlot = blockDim.x*threadIdx.y + threadIdx.x; // 32 by 24 block
    const int inputSlot = DataInput.V*threadIdx.y + threadIdx.x;

    const unsigned int tileArea = blockDim.x*blockDim.y;
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int rowBelow = row + blockDim.y;
    const unsigned int col = blockDim.x*blockIdx.x + threadIdx.x;

    // ---- tile 0: rows above the block ----
    int upperIdx;
    if (blockIdx.y != 0)
    {
        upperIdx = inputSlot - (DataInput.V*blockDim.y);
    }
    else
    {
        // First block row: reuse the block's own rows in reversed order.
        upperIdx = DataInput.V*(blockDim.y - threadIdx.y-1) + threadIdx.x;
    }
    ConvolutionBlockLeft[tileSlot] = DeviceInput[tileOrigin + upperIdx];

    // ---- tile 1: the block's own rows ----
    int centreIdx;
    if (row < DataInput.U)
    {
        centreIdx = tileOrigin+inputSlot;
    }
    else
    {
        // Past the last row: reflect about row U-1.
        centreIdx = ((DataInput.U - 1)*DataInput.V) + col - (row - DataInput.U)*DataInput.V;
    }
    ConvolutionBlockLeft[tileArea + tileSlot] = DeviceInput[centreIdx];

    // ---- tile 2: rows below the block ----
    int lowerIdx;
    if (rowBelow < DataInput.U)
    {
        lowerIdx = tileOrigin+inputSlot + (DataInput.V*blockDim.y);
    }
    else
    {
        // Past the last row: reflect about row U-1.
        lowerIdx = ((DataInput.U - 1)*DataInput.V) + col - ((rowBelow - DataInput.U)*DataInput.V);
    }
    ConvolutionBlockLeft[2 * tileArea + tileSlot] = DeviceInput[lowerIdx];

    // All three tiles must be complete before any thread reads neighbours.
    __syncthreads();

    // Start from the unfiltered centre pixel.
    float result = ConvolutionBlockLeft[tileArea + tileSlot];
    float delayPow = Delay;

    int k = 1;
    while (k < DataInput.KernalNoOfFilters)
    {
        float termSum = 0;

        // Taps are centred on the pixel; tap l reads the pixel l rows up.
        for (int l = -((DeviceFilterSize[k] - 1) / 2); l < ((DeviceFilterSize[k] + 1) / 2); l++)
        {
            termSum += DeviceFilterKernal[k*DataInput.KernalFilterLength + ((DeviceFilterSize[k] - 1) / 2) + l]
                     * ConvolutionBlockLeft[tileArea + tileSlot - (l*blockDim.x)];
        }

        // Weight filter k by Delay^k, then advance the power.
        termSum *= delayPow;
        delayPow *= Delay;
        result += termSum;

        k++;
    }

    // Rows at or beyond U were only loaded as padding; do not write them.
    if (row < DataInput.U)
    {
        DeviceResult[inputSlot + tileOrigin] = result;
    }
}

子内核单独运行时工作正常。但是当它从另一个内核(父内核)中启动时,主机端在内核启动之后调用 cudaDeviceSynchronize() 会得到 "unspecified launch failure"(未指定的启动失败)错误(该错误并没有被内核中的 printf 打印出来)。

父内核的启动配置是 <<<1, (17, 17)>>>。如果只允许父内核中的一个线程启动子网格,则代码可以正常工作。一个线程块中可以启动的子网格数量是否有限制?

转载地址:https://blog.csdn.net/weixin_39817176/article/details/114817127 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!

上一篇:java quartz 源码解析_quartz2.2源码分析1-使用和原理(转)
下一篇:java读取ppt数据_使用Java基于数据流直接抽取ppt文本

发表评论

最新留言

网站不错 人气很旺了 加油
[***.192.178.218]2024年04月24日 06时59分32秒