This is a continuation of Farzad’s response.
The Farzad point is highlighted in the CUDA C Programming Guide:
Texture and surface memory is cached (see Access Memory Access Access) and, within the same kernel call, the cache is not saved in a consistent way with writing to global memory and writing to surface memory, so any texture retrieval or reading of the surface to an address that was written to via global write or surface write in the same kernel call return undefined. In other words, a thread can safely read some texture or memory on the surface only if that memory location was updated by a previous kernel call or copy of memory, but had not previously been updated by the same thread or another thread from the same kernel call.
This means that you can change the global memory cells to which the texture is bound, but this should not happen in the same kernel in which texture frames work. On the other hand, “recording texture” in the above sense is possible in all cores, since the texture cache is cleared when the kernel starts, see the cuda kernel for adding (a, b, c) using texture objects for a and b - it works correctly for "increment operation" add (a, b, a)? .
The following is an example in which changes to the global texture binding are mapped. In this example, I call the CUDA kernels as follows
median_filter_periodic_boundary<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out, N); ... square<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_vec, pitch, N); ... median_filter_periodic_boundary<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out, N);
The median_filter_periodic_boundary kernel uses texture operations, while the square kernel changes the global memory changes to which the texture is bound.
Here is the code:
#include <stdio.h> #include "TimingGPU.cuh" #include "Utilities.cuh" texture<float, 1, cudaReadModeElementType> signal_texture; #define BLOCKSIZE 32 /*************************************************/ /* KERNEL FUNCTION FOR MEDIAN FILTER CALCULATION */ /*************************************************/ __global__ void median_filter_periodic_boundary(float * __restrict__ d_out, const unsigned int N){ int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { float signal_center = tex1D(signal_texture, (float)(tid + 0.5 - 0) / (float)N); float signal_before = tex1D(signal_texture, (float)(tid + 0.5 - 1) / (float)N); float signal_after = tex1D(signal_texture, (float)(tid + 0.5 + 1) / (float)N); d_out[tid] = (signal_center + signal_before + signal_after) / 3.f; } } /*************************************************/ /* KERNEL FUNCTION FOR MEDIAN FILTER CALCULATION */ /*************************************************/ __global__ void square(float * __restrict__ d_vec, const size_t pitch, const unsigned int N){ unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) d_vec[tid] = 2.f * tid; } /********/ /* MAIN */ /********/ int main() { const int N = 10; // --- Input/output host array declaration and initialization float *h_vec = (float *)malloc(N * sizeof(float)); for (int i = 0; i < N; i++) h_vec[i] = (float)i; // --- Input/output host and device array vectors size_t pitch; float *d_vec; gpuErrchk(cudaMallocPitch(&d_vec, &pitch, N * sizeof(float), 1)); printf("pitch = %i\n", pitch); float *d_out; gpuErrchk(cudaMalloc(&d_out, N * sizeof(float))); gpuErrchk(cudaMemcpy(d_vec, h_vec, N * sizeof(float), cudaMemcpyHostToDevice)); // --- CUDA texture memory binding and properties definition cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); //Alternatively //cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); size_t texture_offset = 0; gpuErrchk(cudaBindTexture2D(&texture_offset, signal_texture, d_vec, channelDesc, N, 1, pitch)); signal_texture.normalized = true; signal_texture.addressMode[0] = cudaAddressModeWrap; // --- Median filter kernel execution median_filter_periodic_boundary<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out, N); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); gpuErrchk(cudaMemcpy(h_vec, d_out, N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\n\nFirst filtering\n"); for (int i=0; i<N; i++) printf("h_vec[%i] = %f\n", i, h_vec[i]); // --- Square kernel execution square<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_vec, pitch, N); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); gpuErrchk(cudaMemcpy(h_vec, d_vec, N * sizeof(float), cudaMemcpyDeviceToHost)); printf("\n\nSquaring\n"); for (int i=0; i<N; i++) printf("h_vec[%i] = %f\n", i, h_vec[i]); // --- Median filter kernel execution median_filter_periodic_boundary<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_out, N); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaDeviceSynchronize()); printf("\n\nSecond filtering\n"); gpuErrchk(cudaMemcpy(h_vec, d_out, N * sizeof(float), cudaMemcpyDeviceToHost)); for (int i=0; i<N; i++) printf("h_vec[%i] = %f\n", i, h_vec[i]); printf("Test finished\n"); return 0; }
Please note the following:
- I DO NOT bind the texture to
cudaArray , since cudaArray cannot be modified from inside the kernels; - I DO NOT bind the texture to the
cudaMalloc ed array, since the textures bound to the cudaMalloc ed arrays can only be obtained with tex1Dfetch , and tex1Dfetch does not support the cudaAddressModeWrap addressing cudaAddressModeWrap , which guarantees periodic signal expansion beyond its borders; - I bind the texture to the
cudaMallocPitch ed array, as this makes it possible to select the texture using tex1D , which allows the addressing mode of cudaAddressModeWrap ; - I use normalized coordinates, because only those that activate the
cudaAddressModeWrap addressing cudaAddressModeWrap .
I need points #2 , #3 and #4 since I extracted this example from the code I wrote.