I'm learning how to use __asm__ volatile in GCC and ran into a problem. I want to implement a function that performs an atomic compare-and-exchange and returns the value that was previously stored in the destination.
Why does an "=a"(expected) output constraint work, but an "=r"(expected) constraint lets the compiler generate code that doesn't work?
Case 1.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
/**
 * Atomically compare-and-exchange a 64-bit value (x86-64, GCC extended asm).
 *
 * If *destination equals expected, value is stored into *destination;
 * otherwise *destination is left unchanged.
 *
 * @param destination  memory location to operate on
 * @param expected     value *destination must hold for the store to happen
 * @param value        value to store when the comparison succeeds
 * @return the value *destination held before the operation
 */
uint64_t atomic_cas(uint64_t * destination, uint64_t expected, uint64_t value){
    __asm__ volatile (
        "lock cmpxchgq %[desired], %[dest]"
        : /* cmpxchg reports the old value in RAX, so the output must be pinned to "a" */
          "=a" (expected),
          /* the instruction reads AND writes the memory operand: "+m", not an input-only "m" */
          [dest] "+m" (*destination)
        : "a" (expected),           /* the comparand is implicitly taken from RAX */
          [desired] "r" (value)
        : "memory", "cc"            /* compiler-level memory barrier; cmpxchg also modifies the flags */
    );
    return expected;
}
/* Case 1 driver: swap 10 -> 5, then print the returned old value and the updated cell. */
int main(void)
{
    uint64_t cell = 10;
    const uint64_t previous = atomic_cas(&cell, 10, 5);

    printf("%" PRIu64 "\n", previous); // prints 10: the value held before the swap, as expected
    printf("%" PRIu64 "\n", cell);     // prints 5: the newly stored value, as expected
    return 0;
}
It works as expected. Now consider the following case:
Case 2.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
/*
 * Case 2: intentionally broken variant of atomic_cas, kept unchanged to
 * demonstrate the failure discussed below.
 *
 * The only difference from Case 1 is the output constraint: "=r" instead of "=a".
 * cmpxchg always delivers the previous memory value in RAX, but "=r" allows the
 * compiler to pick ANY general-purpose register for the output operand. The "a"
 * input constraint only says where the input lives; it does not tie the output
 * to RAX. When the compiler picks a register other than RAX for the output,
 * `expected` is read from a register the instruction never wrote.
 */
uint64_t atomic_cas(uint64_t * destination, uint64_t expected, uint64_t value){
__asm__ volatile (
"lock cmpxchgq %3, %1":
"=r" (expected) :// <----- "a" replaced with "r", expecting GCC to infer RAX from the inputs (it does not)
"m" (*destination), "a" (expected), "r" (value) :
"memory"
);
return expected; // may not be the value cmpxchg left in RAX (see the note above the function)
}
// Case 2 driver: performs the same calls as Case 1, but uses the "=r" variant of atomic_cas.
int main(void){
uint64_t v1 = 10;
uint64_t result = atomic_cas(&v1, 10, 5);
printf("%" PRIu64 "\n", result); // prints 5, which is wrong: v1 held 10 before the call
printf("%" PRIu64 "\n", v1); // prints 5, the new value, OK
}
I examined the generated assembly and noticed the following things:
I. In both cases the code of the standalone function is the same and looks like this:
0x0000555555554760 <+0>: mov rax,rsi
0x0000555555554763 <+3>: lock cmpxchg QWORD PTR [rdi],rdx
0x0000555555554768 <+8>: ret
II. The problem appears when GCC inlines atomic_cas: in the latter case the correct value is not passed to the printf function. Here is the relevant fragment of disas main:
0x00000000000005f6 <+38>: lock cmpxchg QWORD PTR [rsp],rdx
0x00000000000005fc <+44>: lea rsi,[rip+0x1f1] # 0x7f4
0x0000000000000603 <+51>: mov rdx,rax ; <-----This instruction is absent in the Case 2.
0x0000000000000606 <+54>: mov edi,0x1
0x000000000000060b <+59>: xor eax,eax
QUESTION: Why does replacing rax (a) with an arbitrary register (r) produce a wrong result? I expected it to work in both cases.
UPD: I compile with the following flags: -Wl,-z,lazy -Warray-bounds -Wextra -Wall -g3 -O3