I am trying to create a large number of sha256 hashes quickly on a T4 machine. The T4 has a 'sha256' instruction that allows me to calculate the hash with a single opcode. I created an inline assembly template to invoke the sha256 opcode:
In my C++ code:
// C linkage keeps the symbol name unmangled so it matches the
// `.inline ProcessChunk` template in pchunk.il, which supplies the body.
//   buf   - input bytes handed to the template
//   state - hash state words, read and then overwritten by the template
extern "C" void ProcessChunk(const char* buf, uint32_t* state);
pchunk.il:
.inline ProcessChunk,8
        ! void ProcessChunk(const char* buf, uint32_t* state)
        !
        ! Register use as written below:
        !   %o0       address of the 64 input bytes (presumably `buf`)
        !   %o1       address of the 32 state bytes (presumably `state`)
        !   %f0-%f7   state, loaded before and stored after the hash opcode
        !   %f8-%f23  input bytes
        !
        ! NOTE(review): ldd/std normally expect doubleword-aligned addresses;
        ! neither `const char*` nor `uint32_t*` guarantees that. Confirm that
        ! callers align both buffers to 8 bytes.
        .volatile

        ! Hash state: 32 bytes at [%o1] -> %f0..%f7
        ldd     [%o1],      %f0
        ldd     [%o1 + 8],  %f2
        ldd     [%o1 + 16], %f4
        ldd     [%o1 + 24], %f6

        ! Input: 64 bytes at [%o0] -> %f8..%f23
        ldd     [%o0],      %f8
        ldd     [%o0 + 8],  %f10
        ldd     [%o0 + 16], %f12
        ldd     [%o0 + 24], %f14
        ldd     [%o0 + 32], %f16
        ldd     [%o0 + 40], %f18
        ldd     [%o0 + 48], %f20
        ldd     [%o0 + 56], %f22

        ! Operand-less T4 hash opcode, followed by the original nop.
        sha256
        nop

        ! Write %f0..%f7 back over the 32 state bytes at [%o1].
        std     %f0, [%o1]
        std     %f2, [%o1 + 8]
        std     %f4, [%o1 + 16]
        std     %f6, [%o1 + 24]
.end
, . openmp , ProcessChunk. , (, 16), . ProcessChunk , . , . ProcessChunk , , ( ). , . , ?
, ?
Update:
, (16 ) :
.inline ProcessChunk,8
        ! void ProcessChunk(const char* buf, uint32_t* state)
        !
        ! Quad-word (16-byte) load/store variant of the template.
        !   %o0       address of the 64 input bytes (presumably `buf`)
        !   %o1       address of the 32 state bytes (presumably `state`)
        !   %f0-%f7   state, loaded and later stored back
        !   %f8-%f23  input bytes
        !
        ! NOTE(review): this variant issues `lzd %o0,%o0` at the point where
        ! the ldd/std variant issues `sha256`. lzd only writes the integer
        ! register %o0, so %f0-%f7 are stored back exactly as loaded. Confirm
        ! this is a deliberate diagnostic substitution and not a typo.
        !
        ! NOTE(review): ldq/stq typically need 16-byte-aligned addresses and
        ! may be emulated by a trap handler on some SPARC implementations --
        ! verify alignment of both buffers and the cost on the target CPU.
        .volatile

        ! Hash state: 32 bytes at [%o1] -> %f0..%f7
        ldq     [%o1],      %f0
        ldq     [%o1 + 16], %f4

        ! Input: 64 bytes at [%o0] -> %f8..%f23
        ldq     [%o0],      %f8
        ldq     [%o0 + 16], %f12
        ldq     [%o0 + 32], %f16
        ldq     [%o0 + 48], %f20

        ! Overwrites %o0; the input address is not needed after the loads.
        lzd     %o0, %o0
        nop

        ! Write %f0..%f7 back over the 32 state bytes at [%o1].
        stq     %f0, [%o1]
        stq     %f4, [%o1 + 16]
.end
, , . 32 , , ( ), , , . , , , .
Update 2:
, , T4 (10 ).
, , :