CUDA L2 Overhead

I have a kernel that renders points using atomicMin. The test setup has a large number of points laid out contiguously in memory. There are two buffers: one uint32 per point, and one uint32 per cluster of 256 points.

namespace Point
{
struct PackedBitfield
{
    glm::uint32_t x : 6;
    glm::uint32_t y : 6;
    glm::uint32_t z : 6;
    glm::uint32_t nx : 4;
    glm::uint32_t ny : 4;
    glm::uint32_t nz : 4;
    glm::uint32_t unused : 2;
};

union __align__(4) Packed
{
    glm::uint32_t bits;
    PackedBitfield field;
};

struct ClusterPositionBitfield
{
    glm::uint32_t x : 10;
    glm::uint32_t y : 10;
    glm::uint32_t z : 10;
    glm::uint32_t w : 2;
};

union ClusterPosition
{
    glm::uint32_t bits;
    ClusterPositionBitfield field;
};
}
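
Each point therefore occupies exactly one 32-bit word (6+6+6 position bits, 4+4+4 normal bits, 2 unused). For context, here is a minimal host-side sketch of packing a quantized point into this layout (the packPoint helper is illustrative, not part of the original code):

// Hypothetical helper: pack a quantized position (0..63 per axis) and a
// quantized normal (0..15 per axis) into one Point::Packed word.
inline Point::Packed packPoint(unsigned px, unsigned py, unsigned pz,
                               unsigned nx, unsigned ny, unsigned nz)
{
    Point::Packed p;
    p.field.x  = px & 0x3Fu;  // 6 bits
    p.field.y  = py & 0x3Fu;
    p.field.z  = pz & 0x3Fu;
    p.field.nx = nx & 0x0Fu;  // 4 bits
    p.field.ny = ny & 0x0Fu;
    p.field.nz = nz & 0x0Fu;
    p.field.unused = 0;
    return p;
}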

//
// launch with blockSize=(256, 1, 1) and grid=(numberOfClusters, 1, 1)
//
extern "C" __global__ void pointsRenderKernel(mat4 u_mvp,
                    ivec2 u_resolution,
                    uint64_t* rasterBuffer,
                    Point::Packed* points, 
                    Point::ClusterPosition* clusterPosition)
{
    // extract and compute world position
    const Point::ClusterPosition cPosition(clusterPosition[blockIdx.x]);
    const Point::Packed point(points[blockIdx.x * 256 + threadIdx.x]);

    // ... use points and write to buffer ...
}

The resulting SASS is as follows:

[screenshot of the generated SASS]

Look at the output of the memory profiler: the L2 Transfer Overhead for reads from the Point::Packed* buffer is 3.0. Why is this? The accesses should be perfectly aligned and coalesced. Also, why does this automatically generate LDG (compute_50, sm_50)? I do not need this data cached.
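
For reference, the caching behaviour of global loads can be steered either at compile time or per load. A minimal sketch of the two standard knobs, not taken from the post above (the kernel and buffers are illustrative):

// 1) Compile-time: make all global loads bypass L1 ("cache global" policy):
//        nvcc -Xptxas -dlcm=cg kernel.cu
//
// 2) Per load: route a read-only value through the read-only data cache
//    with __ldg() (compute capability 3.5+):
__global__ void readOnlyLoadExample(const unsigned int* __restrict__ in,
                                    unsigned int* out)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = __ldg(&in[i]);   // explicit read-only-cache load
}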

1 answer

L2 Transfer Overhead is, according to the profiler documentation, roughly "the amount of data actually transferred between L1 and L2 for each byte requested by L1"; ideally it is "1", and a value above 1 means more data moved between the caches than the kernel requested. For example, with 32-byte L2 transactions, a warp that requests 128 contiguous bytes moves exactly 128 bytes (overhead 1.0), whereas 128 requested bytes scattered across 32 different sectors move 1024 bytes (overhead 8.0).

With the reproducer below, I get an L2 Transfer Overhead of 1.0 when reading the Point::Packed buffer.

[screenshot of the profiler output showing L2 Transfer Overhead = 1.0 for the Point::Packed reads]

#include <cstdint>
#include <iostream>

#include <cuda_runtime.h>

namespace Point
{
    struct PackedBitfield
    {
        uint32_t x : 6;
        uint32_t y : 6;
        uint32_t z : 6;
        uint32_t nx : 4;
        uint32_t ny : 4;
        uint32_t nz : 4;
        uint32_t unused : 2;
    };

    union __align__(4) Packed
    {
        uint32_t bits;
        PackedBitfield field;
    };

    struct ClusterPositionBitfield
    {
        uint32_t x : 10;
        uint32_t y : 10;
        uint32_t z : 10;
        uint32_t w : 2;
    };

    union ClusterPosition
    {
        uint32_t bits;
        ClusterPositionBitfield field;
    };
}

__global__ void pointsRenderKernel(Point::Packed* points, Point::ClusterPosition* clusterPosition)
{
    int t_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Copy each element into the second half of its buffer so the profiler
    // sees both a load and a store on each type.
    clusterPosition[blockIdx.x + gridDim.x] = clusterPosition[blockIdx.x];
    points[t_id + blockDim.x * gridDim.x] = points[t_id];
}

int main()
{
    int blockSize = 256;
    int numberOfClusters = 256;

    std::cout << sizeof(Point::Packed) << std::endl;          // expected: 4
    std::cout << sizeof(Point::ClusterPosition) << std::endl; // expected: 4

    // Allocate twice the required size so the kernel can copy into the
    // second half of each buffer.
    Point::Packed *d_points;
    cudaMalloc(&d_points, sizeof(Point::Packed) * numberOfClusters * blockSize * 2);

    Point::ClusterPosition *d_clusterPositions;
    cudaMalloc(&d_clusterPositions, sizeof(Point::ClusterPosition) * numberOfClusters * 2);

    pointsRenderKernel<<<numberOfClusters, blockSize>>>(d_points, d_clusterPositions);
    cudaDeviceSynchronize();

    return 0;
}
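
Not part of the original answer, but when reproducing profiler experiments it helps to verify that the kernel actually launched and completed. A minimal sketch of error checking that could be added after the launch (the checkCuda helper is illustrative):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper: abort with a message if a CUDA call failed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Usage after the kernel launch in main():
//   checkCuda(cudaGetLastError(), "kernel launch");
//   checkCuda(cudaDeviceSynchronize(), "kernel execution");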

UPDATE

Here is the Nsight configuration I profiled with: CUDA 8.0.61 (the latest release at the time), driver version 376.51, Windows 10 64-bit, Visual Studio 2015, Nsight 5.2, and a cc6.1 GPU.

The compile command was:

nvcc.exe -gencode=arch=compute_61,code=\"sm_61,compute_61\" --use-local-env --cl-version 2015 -Xcompiler "/wd 4819" -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -I "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\include" -lineinfo --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS /Zi /MD" -o x64\Release\kernel.cu.obj kernel.cu



Also tested with sm_50,compute_50: the L2 Transfer Overhead is likewise 1.0.
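
To compare the two architectures from a single build, nvcc can embed code for both targets; a sketch of the relevant flags (same kernel.cu as above, other options omitted):

nvcc -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_61,code=sm_61 -lineinfo -c kernel.cu -o kernel.cu.obj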


Source: https://habr.com/ru/post/1621642/
