Multithreaded inline assembly

I am trying to compute a large number of sha256 hashes quickly on a T4 machine. The T4 has a 'sha256' instruction that lets me calculate the hash with a single opcode. I created an inline assembly template to invoke the sha256 opcode:

In my C++ code:

// C-linkage declaration of ProcessChunk; the body is supplied by the
// inline template in pchunk.il rather than by a C++ definition.
//   buf   - input data; per the template's own comments, 64 bytes are
//           loaded starting at this address.
//   state - hash state; per the template's own comments, 32 bytes are
//           loaded from this address and 32 bytes are stored back to it.
// NOTE(review): the template accesses both pointers with 8-byte (and, in
// the updated version, 16-byte) loads/stores -- confirm callers pass
// suitably aligned buffers.
extern "C"
{
   void ProcessChunk(const char* buf, uint32_t* state);
}

pchunk.il:

.inline ProcessChunk,8  
.volatile
  /*
   * ProcessChunk(buf, state): inline template that issues one sha256
   * instruction.
   *   %o1 - address the 32-byte state is loaded from and stored back to
   *   %o0 - address the 64 bytes of input data are loaded from
   * Registers written: %f0-%f22 (even-numbered double registers).
   * Nothing in this template saves or restores them.
   * NOTE(review): ldd/std normally require 8-byte-aligned addresses --
   * confirm the alignment of both pointers at every call site.
   */

  /* copy state: 4 x 8 bytes from [%o1] into %f0, %f2, %f4, %f6 */
  ldd [%o1],%f0 /* state bytes 0-7 */ 
  ldd [%o1 + 8],%f2 /* state bytes 8-15 */ 
  ldd [%o1 +16],%f4 /* state bytes 16-23 */ 
  ldd [%o1 +24],%f6 /* state bytes 24-31 */ 

  /* copy data: 8 x 8 bytes from [%o0] into %f8 .. %f22 */
  ldd [%o0],%f8 /* data bytes 0-7 */ 
  ldd [%o0+8],%f10 /* data bytes 8-15 */ 
  ldd [%o0+16],%f12 /* data bytes 16-23 */ 
  ldd [%o0+24],%f14 /* data bytes 24-31 */ 
  ldd [%o0+32],%f16 /* data bytes 32-39 */ 
  ldd [%o0+40],%f18 /* data bytes 40-47 */ 
  ldd [%o0+48],%f20 /* data bytes 48-55 */ 
  ldd [%o0+56],%f22 /* data bytes 56-63 */ 

  /* the instruction takes no explicit operands; it works on the */
  /* floating-point registers loaded above */
  sha256
  /* NOTE(review): the purpose of this nop is not evident here -- confirm */
  nop

  /* write %f0, %f2, %f4, %f6 back over the 32-byte state at [%o1] */
  std %f0, [%o1]
  std %f2, [%o1+8]
  std %f4, [%o1+16]
  std %f6, [%o1+24]
.end

, . openmp , ProcessChunk. , (, 16), . ProcessChunk , . , . ProcessChunk , , ( ). , . , ?

, ?

Update:

, (16 ) :

.inline ProcessChunk,8
.volatile
  /*
   * Variant of the ProcessChunk template that moves the same 32 bytes of
   * state and 64 bytes of data with quad-word (16-byte) loads and stores,
   * and issues lzd where the earlier version issued sha256.
   *   %o1 - address the state is loaded from and stored back to
   *   %o0 - address the input data is loaded from; overwritten by lzd
   *         after the loads complete
   * NOTE(review): ldq/stq presumably need 16-byte-aligned addresses --
   * confirm the alignment of both pointers at every call site.
   */

  /* copy state: 2 x 16 bytes from [%o1], starting at %f0 and %f4 */
  ldq [%o1],    %f0
  ldq [%o1 +16],%f4

  /* copy data: 4 x 16 bytes from [%o0], starting at %f8, %f12, %f16, %f20 */
  ldq [%o0],   %f8
  ldq [%o0+16],%f12
  ldq [%o0+32],%f16
  ldq [%o0+48],%f20

  /* stands in for the sha256 instruction of the earlier version; it */
  /* writes only %o0, so the loaded state is stored back unchanged */
  lzd %o0,%o0
  /* NOTE(review): the purpose of this nop is not evident here -- confirm */
  nop

  /* write the two quad registers holding the state back to [%o1] */
  stq %f0, [%o1]
  stq %f4, [%o1+16]
.end

, , . 32 , , ( ), , , . , , , .

2:

, , T4 (10 ).

, , :

  • ,

+4
1

Spark (, ), :

, .

? ProcessChunk .

, , CPU asm "" .

? , , ​​/ . , , .

, ? : , , ?

, , , N , N + 1 ( , ).

+1

Source: https://habr.com/ru/post/1524787/


All Articles