Mapping C++ to assembly

Question

Mapping C++ to assembly

When compiling some code with clang 3.9.1 and optimizations (-O2), I came across unexpected behavior at runtime that I did not see with other compilers (clang 3.8 and gcc 6.3).

I thought I might have unintentionally invoked undefined behavior (compiling with UBSan makes the unexpected behavior go away), so I tried to simplify the program and found that one particular function seems to cause the difference in behavior.

Now I am mapping the assembly back to the C++ to see where things go wrong and to try to determine why, and there are several portions that I find difficult to map back.

link godbolt

C++:

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <thread>
#include <cstdio>

// Two-way state derived from the sign of a counter: to_state() returns A for a
// non-negative value and B for a negative one.
enum class FooState { A, B };

// Three atomic 64-bit counters, all starting at 0.
//   counter   - incremented once per loop iteration in test_function(); the
//               value it held before the increment is passed to to_state().
//   counter_a - incremented by test_function() for iterations in FooState::A.
//   counter_b - incremented by test_function() for iterations in FooState::B.
struct Foo {
  std::atomic<std::int64_t> counter{0};
  std::atomic<std::int64_t> counter_a{0};
  std::atomic<std::int64_t> counter_b{0};
};

// Maps a counter value to a state: negative values give FooState::B, every
// other value gives FooState::A.
//__attribute__((noinline))
FooState to_state(const std::int64_t c) {
  if (c < 0) {
    return FooState::B;
  }
  return FooState::A;
}

// Value that makes test_function() stop looping once it reads it.
static const int NUM_MODIFIES = 100;

// Plain (non-atomic) ints; test_function() reads value_a in state A and
// value_b in state B.
int value_a = 0, value_b = 0;
// The single shared set of counters.
Foo foo;
// test_function() adds the value it read to this when it equals NUM_MODIFIES.
std::atomic<std::int64_t> total_sum{0};

// Loops until the value selected by the current state equals NUM_MODIFIES.
// Each iteration:
//   1. atomically increments foo.counter and derives the state from the value
//      it held before the increment;
//   2. reads value_a (state A) or value_b (state B) and, on a match, adds it
//      to total_sum and flags the loop to stop;
//   3. atomically increments foo.counter_a / foo.counter_b for that state.
// The final iteration still performs step 3 before the loop exits.
void test_function() {
  bool done = false;
  while (!done) {
    // fetch_add returns the pre-increment value; its sign picks the state.
    const std::int64_t count =
        foo.counter.fetch_add(1, std::memory_order_seq_cst);
    const FooState state = to_state(count);

    // Bind to the value that belongs to this iteration's state.
    int &val = FooState::A == state ? value_a : value_b;
    if (val == NUM_MODIFIES) {
      total_sum += val;
      done = true;
    }

    // Record this iteration in the counter belonging to the same state.
    std::atomic<std::int64_t> &c =
        FooState::A == state ? foo.counter_a : foo.counter_b;
    c.fetch_add(1, std::memory_order_seq_cst);
  }
}

Assembly:

# Compiler output for test_function() (presumably clang 3.9.1 at -O2, per the
# surrounding text). The first counter increment and value check are emitted
# ahead of the loop at .LBB1_2.
# NOTE(review): foo, foo+8 and foo+16 presumably correspond to Foo::counter,
# Foo::counter_a and Foo::counter_b (three consecutive 8-byte atomics).
test_function():                     # @test_function()
        # rax has not been written in this function before this point, so the
        # sign captured in al (1 when rax is non-negative) is not derived from
        # the counter at foo.
        test    rax, rax
        setns   al
        # Atomic increment of the quadword at foo. Unlike xadd, inc does not
        # hand back the previous value; only the flags describe the result.
        lock
        inc     qword ptr [rip + foo]
        mov     ecx, value_a
        mov     edx, value_b
        # mov leaves the flags alone, so cmovg still sees the flags of the inc:
        # rdx = &value_a when the incremented value is > 0, else &value_b.
        cmovg   rdx, rcx
        # Selected value == 100 (NUM_MODIFIES)? Then go to the exit path.
        cmp     dword ptr [rdx], 100
        je      .LBB1_3
        # Addresses kept in registers across the loop: rcx = foo+8,
        # rdx = &value_a.
        mov     ecx, foo+8
        mov     edx, value_a
.LBB1_2:                                # =>This Inner Loop Header: Depth=1
        # rax = (al & 1) ? foo+8 : foo+16, then atomically increment that
        # counter. The mov overwrites al, but not the flags set by test.
        test    al, 1
        mov     eax, foo+16
        cmovne  rax, rcx
        lock
        inc     qword ptr [rax]
        # NOTE(review): rax now holds the counter address chosen just above
        # (built from zero-extended 32-bit immediates), so this test sees a
        # non-negative value and setns produces al = 1 on every pass.
        test    rax, rax
        setns   al
        # Next iteration's increment of the quadword at foo and value check,
        # using the same flag-based selection as before the loop.
        lock
        inc     qword ptr [rip + foo]
        mov     esi, value_b
        cmovg   rsi, rdx
        cmp     dword ptr [rsi], 100
        jne     .LBB1_2
.LBB1_3:
        # Exit path: total_sum += 100, then one last counter increment at
        # foo+8 (al != 0) or foo+16 (al == 0).
        lock
        add     qword ptr [rip + total_sum], 100
        test    al, al
        mov     eax, foo+8
        mov     ecx, foo+16
        cmovne  rcx, rax
        lock
        inc     qword ptr [rcx]
        ret

It seems that marking to_state as noinline, or changing done, "fixes" the unexpected behavior.

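The unexpected behavior I am seeing is this: when counter >= 0, counter_a should be incremented; otherwise counter_b should be incremented. As far as I can tell this sometimes does not happen, but determining exactly when/why has been difficult.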

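One part of the assembly I could use some help with is the test rax, rax; setns al and test al, 1 portions. It looks as if the initial test does not set al deterministically, and that value is then used to decide which counter to increment, but maybe I am misunderstanding something.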

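Below is a small, complete example that demonstrates the issue. It shows the unexpected behavior when compiled with clang 3.9 and -O2.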

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <thread>
#include <cstdio>

// Two-way state derived from the sign of a counter: to_state() and to_state2()
// return A for a non-negative value and B for a negative one.
enum class FooState { A, B };

// Three atomic 64-bit counters, all starting at 0.
//   counter   - incremented once per loop iteration in test_function(); main()
//               exchanges it with INT64_MIN or 0 to flip the state.
//   counter_a - incremented by test_function() for iterations in FooState::A;
//               reset to 0 and polled by main().
//   counter_b - incremented by test_function() for iterations in FooState::B;
//               reset to 0 and polled by main().
struct Foo {
  std::atomic<std::int64_t> counter{0};
  std::atomic<std::int64_t> counter_a{0};
  std::atomic<std::int64_t> counter_b{0};
};

// Maps a counter value to a state: negative values give FooState::B, every
// other value gives FooState::A. Called from test_function().
//__attribute__((noinline))
FooState to_state(const std::int64_t c) {
  if (c < 0) {
    return FooState::B;
  }
  return FooState::A;
}

// Same mapping as to_state() (negative -> FooState::B, otherwise FooState::A),
// kept as a separate function; main() calls this copy.
//__attribute__((noinline))
FooState to_state2(const std::int64_t c) {
  if (c < 0) {
    return FooState::B;
  }
  return FooState::A;
}

// Last index main() writes, and the value that makes test_function() stop.
static const int NUM_MODIFIES = 100;

// Plain (non-atomic) ints. test_function() reads the one matching the current
// state; main() writes the other one.
int value_a = 0, value_b = 0;
// The single set of counters shared by main() and test_function().
Foo foo;
// test_function() adds the value it read to this when it equals NUM_MODIFIES;
// main() checks it after joining the thread.
std::atomic<std::int64_t> total_sum{0};

// Reader side; main() runs it on a separate thread. Loops until the value
// selected by the current state equals NUM_MODIFIES. Each iteration:
//   1. atomically increments foo.counter and derives the state from the value
//      it held before the increment;
//   2. reads value_a (state A) or value_b (state B) and, on a match, adds it
//      to total_sum and flags the loop to stop;
//   3. atomically increments foo.counter_a / foo.counter_b for that state.
// The final iteration still performs step 3 before the loop exits.
void test_function() {
  bool done = false;
  while (!done) {
    // fetch_add returns the pre-increment value; its sign picks the state.
    const std::int64_t count =
        foo.counter.fetch_add(1, std::memory_order_seq_cst);
    const FooState state = to_state(count);

    // Bind to the value that belongs to this iteration's state.
    int &val = FooState::A == state ? value_a : value_b;
    if (val == NUM_MODIFIES) {
      total_sum += val;
      done = true;
    }

    // Record this iteration in the counter belonging to the same state.
    std::atomic<std::int64_t> &c =
        FooState::A == state ? foo.counter_a : foo.counter_b;
    c.fetch_add(1, std::memory_order_seq_cst);
  }
}

// Writer side of the reproduction. Starts test_function() on a second thread,
// then for i = 0 .. NUM_MODIFIES:
//   1. writes i into whichever of value_a / value_b the current state does
//      NOT select,
//   2. flips the state by exchanging foo.counter (INT64_MIN -> state B,
//      0 -> state A); the value swapped out gives the number of reader
//      iterations that started in the old state,
//   3. yields until the old state's counter (counter_a / counter_b) reaches
//      that number, printing the first 10 mismatches of each wait.
// Finally joins the thread and prints "ok" if total_sum equals NUM_MODIFIES,
// "fail" otherwise.
int main() {
  std::thread thread(test_function);

  // int index: it is stored into an int and compared against an int constant.
  for (int i = 0; i <= NUM_MODIFIES; ++i) {
    const std::int64_t count = foo.counter.load(std::memory_order_seq_cst);
    const FooState state = to_state2(count);

    unsigned log_count = 0;

    int &inactive_val = FooState::A == state ? value_b : value_a;
    inactive_val = i;

    if (FooState::A == state) {
      foo.counter_b.store(0, std::memory_order_seq_cst);
      // The counter was counting up from 0, so the swapped-out value is the
      // number of increments performed while in state A.
      const std::int64_t accesses_to_wait_for =
          foo.counter.exchange(INT64_MIN, std::memory_order_seq_cst);
      while (accesses_to_wait_for !=
             foo.counter_a.load(std::memory_order_seq_cst)) {
        std::this_thread::yield();

        if (++log_count <= 10) {
          // Cast to long long: int64_t is not `long` on every platform.
          std::printf("#1 wait_for=%lld, val=%lld\n",
                      static_cast<long long>(accesses_to_wait_for),
                      static_cast<long long>(
                          foo.counter_a.load(std::memory_order_seq_cst)));
        }
      }
    } else {
      foo.counter_a.store(0, std::memory_order_seq_cst);

      // The counter was counting up from INT64_MIN, so its distance from
      // INT64_MIN is the number of increments performed while in state B.
      // The swapped-out value is expected to still be negative here, in which
      // case the subtraction cannot overflow.
      const std::int64_t previous =
          foo.counter.exchange(0, std::memory_order_seq_cst);
      const std::int64_t accesses_to_wait_for = previous - INT64_MIN;

      while (accesses_to_wait_for !=
             foo.counter_b.load(std::memory_order_seq_cst)) {
        std::this_thread::yield();

        if (++log_count <= 10) {
          std::printf("#2 wait_for=%lld, val=%lld\n",
                      static_cast<long long>(accesses_to_wait_for),
                      static_cast<long long>(
                          foo.counter_b.load(std::memory_order_seq_cst)));
        }
      }
    }

    std::printf("modify #%d complete\n", i);
  }

  std::printf("modifies complete\n");

  thread.join();

  // Compare signed with signed; total_sum holds a std::int64_t.
  const std::int64_t expected_result = NUM_MODIFIES;
  std::printf("%s\n", total_sum.load() == expected_result ? "ok" : "fail");
}

+4

c++ assembly clang

CTT 11 . '17 17:41

1

Ped7g · Accepted Answer · 2017-01-11T18:29:24+0000

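I'm not 100% sure (I didn't debug it, just simulated it in my head), but I think the test rax,rax + setns al pair is testing the wrong thing.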

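al = 1 results when the initial rax < 0 test is false (UB, since rax is not initialized at that point); after that, inside the loop, the result is always "NS" (rax holds a 32b address => SF = 0 => al = 1), so for the whole loop it is stuck with al == 1, i.e. counter_a.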

, ( ).

Mapping C++ to assembly

More articles: