I am trying to create arrays __m256ifor reuse in another calculation. When I try to do this (even with a minimal test case), I get a segmentation error - but only if the code is compiled with g ++ or clang. If I compile code with the Intel compiler (version 16.0), a segmentation error does not occur. Here is a test case that I created:
int main() {
__m256i *table = new __m256i[10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
table[99] = zeroes;
}
When compiling the above with clang 3.6 and g ++ 4.8, a segmentation error occurs.
Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/ , icc 13.0):
pushq %rbx
movq %rsp, %rbx
andq $-32, %rsp
pushq %rbp
pushq %rbp
movq 8(%rbx), %rbp
movq %rbp, 8(%rsp)
movq %rsp, %rbp
subq $112, %rsp
movl $3200, %eax
vzeroupper
movq %rax, %rdi
call operator new[](unsigned long)
movq %rax, -112(%rbp)
movq -112(%rbp), %rax
movq %rax, -104(%rbp)
vxorps %ymm0, %ymm0, %ymm0
vmovdqu %ymm0, -80(%rbp)
vmovdqu -80(%rbp), %ymm0
vmovdqu %ymm0, -48(%rbp)
movl $3168, %eax
addq -104(%rbp), %rax
vmovdqu -48(%rbp), %ymm0
vmovdqu %ymm0, (%rax)
movl $0, %eax
vzeroupper
leave
movq %rbx, %rsp
popq %rbx
ret
And here from clang 3.7:
pushq %rbp
movq %rsp, %rbp
andq $-32, %rsp
subq $192, %rsp
xorl %eax, %eax
movl $3200, %ecx
movl %ecx, %edi
movl %eax, 28(%rsp)
callq operator new[](unsigned long)
movq %rax, 88(%rsp)
movq $0, 168(%rsp)
movq $0, 160(%rsp)
movq $0, 152(%rsp)
movq $0, 144(%rsp)
vmovq 168(%rsp), %xmm0
vmovq 160(%rsp), %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0
vmovq 152(%rsp), %xmm1
vpslldq $8, %xmm1, %xmm1
vmovaps %xmm1, %xmm2
vinserti128 $1, %xmm0, %ymm2, %ymm2
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
movq 88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
movl 28(%rsp), %eax
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
Am I facing a compiler error in clang / g ++? Or am I just doing something wrong?