L2 Transfer Overhead , " , L1 L2 L1", , " ".
L2 Point::Packed 1.0.

namespace Point
{
struct PackedBitfield
{
uint32_t x : 6;
uint32_t y : 6;
uint32_t z : 6;
uint32_t nx : 4;
uint32_t ny : 4;
uint32_t nz : 4;
uint32_t unused : 2;
};
union __align__(4) Packed
{
uint32_t bits;
PackedBitfield field;
};
struct ClusterPositionBitfield
{
uint32_t x : 10;
uint32_t y : 10;
uint32_t z : 10;
uint32_t w : 2;
};
union ClusterPosition
{
uint32_t bits;
ClusterPositionBitfield field;
};
}
__global__ void pointsRenderKernel(Point::Packed* points, Point::ClusterPosition* clusterPosition)
{
int t_id = blockIdx.x * blockDim.x + threadIdx.x;
clusterPosition[blockIdx.x + blockDim.x] = clusterPosition[blockIdx.x];
points[t_id + blockDim.x * gridDim.x] = points[t_id];
}
void main()
{
int blockSize = 256;
int numberOfClusters = 256;
std::cout << sizeof(Point::Packed) << std::endl;
std::cout << sizeof(Point::ClusterPosition) << std::endl;
Point::Packed *d_points;
cudaMalloc(&d_points, sizeof(Point::Packed) * numberOfClusters * blockSize * 2);
Point::ClusterPosition *d_clusterPositions;
cudaMalloc(&d_points, sizeof(Point::ClusterPosition) * numberOfClusters * 2);
pointsRenderKernel<<<numberOfClusters, blockSize>>>(d_points, d_clusterPositions);
}
UPDATE
Nsight , . , CUDA 8.0.61 ( ), . , , - 376,51. Windows 10 64-bit Visual Studio 2015, Nsight - 5.2, - cc6.1.
:
nvcc.exe -gencode = arch = compute_61, code =\ "sm_61, compute_61 \" --use-local-env -cl- 2015 -Xcompiler "/wd 4819" -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -I "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" -lineinfo --keep-dir x64\Release -maxrregcount = 0 --machine 64 --compile -cudart static -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc/W3/nologo/O2/FS/Zi/MD" -o x64\Release\kernel.cu.obj kernel.cu "
2
, sm_50,compute_50: 1.0 L2 Transfer Overhead.