If there is no architecture-dependent intrinsic that maps imulh32 to the native instruction, then I think this is about all you can do.
For example, g++ 6.3 compiles the function below to just 1 multiply instruction, and the only extra work is the shift - shr $0x20.
/*
 * Returns the high 32 bits of the full 64-bit product x * y.
 *
 * Both operands are widened to unsigned long long before multiplying:
 * that type is guaranteed to be at least 64 bits wide, whereas
 * unsigned long may be only 32 bits, which would drop the high half
 * of the product before the shift.
 */
unsigned long umulhi32( unsigned int x, unsigned int y)
{
    /* 32x32 -> 64-bit multiply, then keep the upper word. */
    unsigned long long product = (unsigned long long)x * (unsigned long long)y;
    return (unsigned long)(product >> 32);
}
0000000000000960 <_Z8umulhi32jy>:
960: 89 f8 mov %edi,%eax
962: 89 f7 mov %esi,%edi
964: 48 0f af c7 imul %rdi,%rax
968: 48 c1 e8 20 shr $0x20,%rax
96c: c3 retq
96d: 0f 1f 00 nopl (%rax)
As for CUDA's umulhi: it is available at the PTX level, and the PTX documentation for the related CUDA mul24 instruction says (quoting):
mul24.hi performs a 24x24-bit multiply and returns the high 32 bits of the 48-bit result.
, x86 .
, .