There is a way to solve this problem. It lets you use the VMOVDQA instruction (an analogue of MOVAPD) instead of MOVUPD:
inline __m256d Load(const double * p) { #ifdef _MSC_VER return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p)); #else return _mm256_load_pd(p); #endif }
A similar solution works for the float type:
inline __m256 Load(const float * p) { #ifdef _MSC_VER return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p)); #else return _mm256_load_ps(p); #endif }
However, to trick the Visual Studio compiler, you need to use pointers to dynamically allocated memory. Otherwise, the compiler does not use the VMOVDQA instruction.
#include <immintrin.h> int main() { float * ps = (float*)_mm_malloc(40, 32); double * pd = (double*)_mm_malloc(40, 32); __m256 s = Load(ps); //00007FF79FF81325 vmovdqa ymm1,ymmword ptr [rdi] __m256d d = Load(pd); //00007FF79FF8132F vmovdqa ymm0,ymmword ptr [rax] _mm256_storeu_ps(ps, s); _mm256_storeu_pd(pd, d); _mm_free(ps); _mm_free(pd); }
Ermig source share