How do I correctly manage a result array that is larger than the memory available on the GPU?

Assume error handling is already dealt with like this:

#include <stdio.h>
#include <stdlib.h>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

Normally, to save our results in a d_results array of N doubles that fits into GPU memory in one piece, we would allocate it and transfer the data from the device to the host like this:

double *d_results;
HANDLE_ERROR(cudaMalloc(&d_results, N * sizeof(double)));
// Launch our kernel to do some computations and store the results in d_results
.....
// and transfer our data from the device to the host
std::vector<double> results(N);
HANDLE_ERROR(cudaMemcpy(results.data(), d_results, N * sizeof(double), cudaMemcpyDeviceToHost));

But what if the cudaMalloc call fails because there is not enough device memory to hold all the results at once? How do I perform the calculations and correctly transfer the results to the host? Does this require computing in batches? I would prefer to avoid batching by hand. What is the standard approach to this situation in CUDA?

Answer:

There is no magic here: you process the data in batches that fit into whatever device buffer you manage to allocate. Something like this:

#include <assert.h>
#include <iostream>

int main()
{
    // Allocate a 4 GiB array on the host (2^30 ints)
    const size_t N = 1 << 30;
    int * data = new int[N];

    // Allocate as much memory as will fit on the GPU: query the free
    // device memory, then back off in 1 MiB steps until cudaMalloc succeeds
    size_t total_mem, free_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    const size_t MB = 1 << 20;

    cudaError_t status = cudaErrorMemoryAllocation;
    int *buffer;
    size_t buffer_size = free_mem;
    for(; buffer_size > MB; buffer_size -= MB) {
        status = cudaMalloc((void **)&buffer, buffer_size);
        if (status == cudaSuccess)
            break;
    }
    if (status != cudaSuccess) {
        std::cerr << "Failed to allocate a device buffer" << std::endl;
        return 1;
    }

    std::cout << "Allocated " << buffer_size << " bytes on GPU" << std::endl;

    // Loop through the host source data in batches that fit in the buffer
    std::cout << N << " items require processing" << std::endl;
    size_t batchN = buffer_size / sizeof(int);
    size_t remainN = N;
    int * dp = data;
    std::cout << "Using batch size " << batchN << std::endl;

    for(; remainN > 0; remainN -= batchN) {
        batchN = (remainN < batchN) ? remainN : batchN;
        size_t worksize = batchN * sizeof(int);
        std::cout << "Processing batch of size " << batchN;
        std::cout << ", " << remainN << " items remaining" << std::endl;
        // Copy a batch in, "process" it (cudaMemset stands in for a real
        // kernel launch), and copy the result back out
        cudaMemcpy(buffer, dp, worksize, cudaMemcpyHostToDevice);
        cudaMemset(buffer, 0xff, worksize);
        cudaMemcpy(dp, buffer, worksize, cudaMemcpyDeviceToHost);
        dp += batchN;
    }

    // Every byte of every int was set to 0xff on the device
    for(size_t i = 0; i < N; i++) {
        assert(data[i] == 0xffffffff);
    }

    cudaFree(buffer);
    delete[] data;
    cudaDeviceReset();

    return 0;
}

Two things to note in this example:

  • the host array is deliberately bigger than the GPU can hold at once,
  • the device buffer is allocated once, as large as free GPU memory allows, and reused for every batch.

The cudaMemset call is only a proxy for a real kernel, but you get the idea. If you want to hide the transfer time as well, overlap computation with the host/device copies (copy/compute overlap) by using multiple device buffers and CUDA streams, processing one batch while the next is being transferred.
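
To illustrate that last point, here is a minimal sketch of such a copy/compute overlap, assuming two device buffers and two streams, pinned host memory (needed for cudaMemcpyAsync to actually overlap), and cudaMemsetAsync again standing in for a real kernel; the sizes and names here are illustrative:

#include <iostream>

int main()
{
    const size_t N = 1 << 26;       // total number of items (illustrative)
    const size_t batchN = 1 << 22;  // items per batch (illustrative)

    // Pinned host memory is required for cudaMemcpyAsync to overlap
    // with kernel execution
    int *data;
    cudaMallocHost(&data, N * sizeof(int));

    // Two device buffers and two streams: while one batch is being
    // processed, the next is already in flight
    int *buf[2];
    cudaStream_t stream[2];
    for (int i = 0; i < 2; i++) {
        cudaMalloc(&buf[i], batchN * sizeof(int));
        cudaStreamCreate(&stream[i]);
    }

    const size_t nbatches = (N + batchN - 1) / batchN;
    for (size_t b = 0; b < nbatches; b++) {
        const int s = b % 2;                 // alternate buffer/stream
        const size_t offset = b * batchN;
        const size_t thisN = (N - offset < batchN) ? (N - offset) : batchN;
        const size_t bytes = thisN * sizeof(int);

        // Make sure the previous batch using this buffer has drained
        cudaStreamSynchronize(stream[s]);

        cudaMemcpyAsync(buf[s], data + offset, bytes,
                        cudaMemcpyHostToDevice, stream[s]);
        // A real kernel launch would go here instead, e.g.
        // kernel<<<grid, block, 0, stream[s]>>>(buf[s], thisN);
        cudaMemsetAsync(buf[s], 0xff, bytes, stream[s]);
        cudaMemcpyAsync(data + offset, buf[s], bytes,
                        cudaMemcpyDeviceToHost, stream[s]);
    }

    // Drain both streams before touching the results on the host
    cudaDeviceSynchronize();

    for (int i = 0; i < 2; i++) {
        cudaFree(buf[i]);
        cudaStreamDestroy(stream[i]);
    }
    cudaFreeHost(data);
    cudaDeviceReset();

    return 0;
}

Once the pipeline is full, the copy engine and the compute engine work on different batches at the same time; error checking (for example with a macro like HANDLE_ERROR from the question) is omitted for brevity.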


Source: https://habr.com/ru/post/1620751/

