In theory, you might have the most efficient code since synchronization is no longer required.
In practice, it depends on the architecture and on how acquire/release operations map onto its instruction set (ARMv8.3-A being one interesting case).
On x86_64 in particular, fetch_sub(std::memory_order_acq_rel) compiles to exactly the same instruction as fetch_sub(std::memory_order_release).
So, paradoxically, the theoretically optimal version generates worse code than the straightforward one. Compare:
#include <atomic>

std::atomic<int> cnt;
int* p;

void optimal_in_theory() {
    // Release decrement: publishes this thread's accesses to the object.
    if (cnt.fetch_sub(1, std::memory_order_release) == 1) {
        // Acquire load: synchronizes with the other threads' release
        // decrements before the object is destroyed.
        cnt.load(std::memory_order_acquire);
        delete p;
    }
}

void optimal_in_practice_on_x86_64() {
    // The acquire half is free on x86_64: this compiles to the same
    // lock sub as the release-only version, with no extra load.
    if (cnt.fetch_sub(1, std::memory_order_acq_rel) == 1) {
        delete p;
    }
}
Here is the assembly each of them compiles to:
optimal_in_theory():
lock sub DWORD PTR cnt[rip], 1
je .L4
rep ret
.L4:
mov eax, DWORD PTR cnt[rip] ;Unnecessary extra load
mov rdi, QWORD PTR p[rip]
mov esi, 4
jmp operator delete(void*, unsigned long)
optimal_in_practice_on_x86_64():
lock sub DWORD PTR cnt[rip], 1
je .L7
rep ret
.L7:
mov rdi, QWORD PTR p[rip]
mov esi, 4
jmp operator delete(void*, unsigned long)
Couldn't the compiler optimize the extra load away?
In general, no: compilers today do not optimize atomics. For example, take three consecutive loads:
cnt.load(std::memory_order_acquire);
cnt.load(std::memory_order_acquire);
cnt.load(std::memory_order_acquire);
GCC and Clang both compile this to three actual loads:
mov eax, DWORD PTR cnt[rip]
mov eax, DWORD PTR cnt[rip]
mov eax, DWORD PTR cnt[rip]
None of the loads is merged, even though doing so would be a legal "as-if" transformation. The reason is partly historical: atomics were long treated the way volatile is, and the contract for volatile is, roughly, "every access written in the source must appear in the generated code". Why compilers don't optimize atomics is discussed at length on SO.
The practical takeaway: every atomic operation you write survives into the generated code as-is, so picking the cheapest memory order that is still correct is entirely up to you.
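To see the contrast, consider this small sketch (my example, not from the original): the plain reads may legally be collapsed into a single load, while the atomic reads are all emitted.

#include <atomic>

int plain;
std::atomic<int> a;

int sum_plain() {
    // The compiler is free to merge these into one load of `plain`
    // under the as-if rule, and at -O2 it does.
    return plain + plain + plain;
}

int sum_atomic() {
    // Each of these is emitted as a separate load: GCC and Clang do
    // not merge atomic accesses, even though it would be legal here.
    int x = a.load(std::memory_order_acquire);
    int y = a.load(std::memory_order_acquire);
    int z = a.load(std::memory_order_acquire);
    return x + y + z;
}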
Kieth pointed out that std::atomic_thread_fence can be used instead of the acquire load. It does get rid of the extra load; the caveat is that a standalone fence is a "stronger" form of synchronization than an acquire operation on a single object, so on some targets it can cost more (on x86_64, though, an acquire fence compiles to no instruction at all ;)).
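For the curious, here is a minimal sketch of the fence-based variant (optimal_with_fence is my name for it, not from the original):

#include <atomic>

std::atomic<int> cnt;
int* p;

void optimal_with_fence() {
    // Release decrement, exactly as in optimal_in_theory().
    if (cnt.fetch_sub(1, std::memory_order_release) == 1) {
        // Our fetch_sub read the value written by the previous release
        // decrement, so this acquire fence synchronizes with the other
        // threads' decrements; on x86_64 it emits no instruction.
        std::atomic_thread_fence(std::memory_order_acquire);
        delete p;
    }
}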
But why do we need the acquire at all?
Recall what shared_ptr uses the counter for: the thread that performs the last decrement runs the destructor, so it must "see" everything the other threads did to the object before they dropped their references.
In other words, delete p; must "happen after" every access to the object made in every other thread.
Let's verify that our code actually guarantees this, with the standard in hand:
[intro.races]/9:
An evaluation A inter-thread happens before an evaluation B if
- A synchronizes with B, or
- A is dependency-ordered before B, or
- A synchronizes with some evaluation X and X is sequenced before B, or
- A is sequenced before some evaluation X and X inter-thread happens before B, or
- A inter-thread happens before some evaluation X and X inter-thread happens before B.
[intro.races]/10:
An evaluation A happens before an evaluation B (or, equivalently, B happens after A) if:
- A is sequenced before B, or
- A inter-thread happens before B.
, "synchronize with" fetch_sub, delete p, fetch_sub.
[atomics.order]/2:
An atomic operation A that performs a release operation on an atomic object M synchronizes with an atomic operation B that performs an acquire operation on M and takes its value from any side effect in the release sequence headed by A.
So each earlier fetch_sub must perform a release operation, the last fetch_sub must perform an acquire operation, and the latter must take its value from a side effect in the release sequence headed by each of the former.
And by [intro.races]/5, the release sequence headed by a release fetch_sub on cnt does include all the later fetch_subs, because a fetch_sub is an atomic read-modify-write operation (a fetch_add, or any other RMW on cnt, would extend the sequence just as well).
Putting it all together: the final acquire fetch_sub reads a value written within the release sequence of every earlier release fetch_sub, so it synchronizes with all of them, and delete p therefore "happens after" every access to the object in every other thread. Which is exactly the guarantee we needed.
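To close, the complete pattern as a runnable sketch (my example, with three owners): every decrement is a read-modify-write, so the decrements extend one another's release sequences and the last one synchronizes with both predecessors, whatever the interleaving.

#include <atomic>
#include <thread>

std::atomic<int> cnt{3};
int* p = new int(0);

void drop_reference() {
    // Every decrement is a release; the one that hits zero also acts
    // as an acquire, and on x86_64 acq_rel costs nothing extra.
    if (cnt.fetch_sub(1, std::memory_order_acq_rel) == 1) {
        // The fetch_subs are RMWs, so each release decrement's release
        // sequence is extended by the later ones (3 -> 2 -> 1 -> 0 in
        // cnt's modification order); reading 1 here therefore
        // synchronizes with both earlier decrements.
        delete p;
    }
}

int main() {
    std::thread t1(drop_reference), t2(drop_reference), t3(drop_reference);
    t1.join();
    t2.join();
    t3.join();
}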