If I understand what you're trying to do, the logical approach is to use the C++ `reinterpret_cast` mechanism to make the compiler generate the correct vector load instruction, and then use CUDA's built-in byte-sized vector type `uchar4` to access each byte within each of the four 32-bit words loaded from global memory. With this approach you are really trusting the compiler to know the best way to do byte-wise access within each 32-bit register.
A completely contrived example might look like this:
#include <cstdio>
#include <cstdlib>

// Abort with the failing file/line and the CUDA error string whenever a
// runtime API call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Each thread loads one uint4 (four 32-bit words) from `in` with a single
// vector load, views each word as a uchar4, and writes one selected byte
// per word to `out`: byte 0 of word 0, byte 1 of word 1, byte 2 of word 2,
// byte 3 of word 3.
//
// Preconditions (not checked inside the kernel):
//   - Indexing uses threadIdx.x only, so launch with a single 1-D block.
//   - `in` must be 16-byte aligned (required for the uint4 access) and hold
//     at least 4 * blockDim.x unsigned ints.
//   - `out` must hold at least 4 * blockDim.x bytes.
__global__ void kernel(unsigned int *in, unsigned char* out)
{
    int tid = threadIdx.x;

    uint4* p = reinterpret_cast<uint4*>(in);
    uint4 i4 = p[tid]; // vector load here

    // Reinterpret each 32-bit word as four bytes; every word of the vector
    // is touched so the compiler has no reason to split the vector load.
    uchar4 c0 = *reinterpret_cast<uchar4 *>(&i4.x);
    uchar4 c4 = *reinterpret_cast<uchar4 *>(&i4.y);
    uchar4 c8 = *reinterpret_cast<uchar4 *>(&i4.z);
    uchar4 c12 = *reinterpret_cast<uchar4 *>(&i4.w);

    out[tid*4+0] = c0.x;
    out[tid*4+1] = c4.y;
    out[tid*4+2] = c8.z;
    out[tid*4+3] = c12.w;
}

// Host driver: uploads eight packed words, runs the kernel with one thread
// per uint4, copies the eight extracted bytes back and prints them.
int main(void)
{
    const size_t nwords = 8;               // input words; also output bytes
    const unsigned int nthreads = 2;       // one thread per uint4 (4 words)

    // With the low byte of each word at the lowest address (little-endian),
    // the selected bytes spell "fourbyte".
    unsigned int c[nwords] = {
        2021161062, 2021158776, 2020964472, 1920497784,
        2021161058, 2021161336, 2020898936, 1702393976 };

    unsigned int * _c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&_c, sizeof(unsigned int)*nwords));
    CUDA_CHECK(cudaMemcpy(_c, c, sizeof(unsigned int)*nwords,
                          cudaMemcpyHostToDevice));

    unsigned char * _m = NULL;
    CUDA_CHECK(cudaMalloc((void **)&_m, sizeof(unsigned char)*nwords));

    kernel<<<1,nthreads>>>(_c, _m);
    CUDA_CHECK(cudaGetLastError());        // surfaces launch-configuration errors

    // The blocking copy also waits for the kernel and reports any
    // execution fault that occurred inside it.
    unsigned char m[nwords];
    CUDA_CHECK(cudaMemcpy(m, _m, sizeof(unsigned char)*nwords,
                          cudaMemcpyDeviceToHost));

    for(int i=0; i<int(nwords); i++)
        fprintf(stdout, "%d %c\n", i, m[i]);

    // Release device buffers instead of leaking them until process exit.
    CUDA_CHECK(cudaFree(_m));
    CUDA_CHECK(cudaFree(_c));

    return 0;
}
which should produce a readable string of characters that is embedded in the array of unsigned integers supplied to the kernel.
One caveat is that the open64 compiler used for compute capability 1.x targets would often defeat this strategy of generating vector loads if it could detect that not all of the words in the vector were actually used. So make sure you touch all of the words in the input vector type to ensure the compiler plays nicely.
source share