Assembly for C++ developers

· 1403 words · 7 minute read

A quick post on how to read assembly quickly as a C++ developer. Note that this is NOT a guide on how to read assembly code. Rather, this post shows how common C++ constructs/routines are compiled into assembly. Knowing these patterns can help you read asm faster.

There are plenty of guides on assembly online, for example here.

To get assembly generated from your code, you can use

Setup 🔗

This article shows a list of C++ code snippets and their corresponding assembly. The examples use Intel syntax and are compiled for Linux x86-64. Most of them are compiled with GCC with optimization enabled (-O3). Other platforms and compilers might show different results.

There are links to the explanation if you find the resulting assembly code surprising.

Examples 🔗

C++ Possible Assembly
Basic
// Global variable int x; x: .zero 4
// String literals std::cout << "ABCDEF"; .LC0: .string "ABCDEF" foo(): mov edx, 6 mov esi, OFFSET FLAT:.LC0 mov edi, OFFSET FLAT:_ZSt4cout ...
// Array literals std::array<int, 4> a{1, 2, 3, 4}; .LC0: .long 1 .long 2 .long 3 .long 4
// Set a variable to 0 x = 0; xor eax, eax
// Test if variable equals 0 return x == 0; xor eax, eax test edi, edi sete al ret
struct A { int *x; A() { x = new int[50]; } };
int foo() { // Static variable inside function static A a; return a.x[0]; }
foo(): movzx eax, BYTE PTR guard variable for foo()::a[rip] test al, al je .L16 mov rax, QWORD PTR foo()::a[rip] mov eax, DWORD PTR [rax] ret .L16: push rbx mov edi, OFFSET FLAT:guard variable for foo()::a call __cxa_guard_acquire test eax, eax jne .L17 mov rax, QWORD PTR foo()::a[rip] pop rbx mov eax, DWORD PTR [rax] ret .L17: mov edi, 200 call operator new[](unsigned long) mov edi, OFFSET FLAT:guard variable for foo()::a mov QWORD PTR foo()::a[rip], rax call __cxa_guard_release mov rax, QWORD PTR foo()::a[rip] pop rbx mov eax, DWORD PTR [rax] ret mov rbx, rax jmp .L5 foo() [clone .cold]: .L5: mov edi, OFFSET FLAT:guard variable for foo()::a call __cxa_guard_abort mov rdi, rbx call _Unwind_Resume
// Volatile void foo(volatile int &x, volatile int &y) { y = x; y = x; }
void bar(int &x, int &y) { y = x; y = x; }
foo(int volatile&, int volatile&): mov eax, DWORD PTR [rdi] mov DWORD PTR [rsi], eax mov eax, DWORD PTR [rdi] mov DWORD PTR [rsi], eax ret bar(int&, int&): mov eax, DWORD PTR [rdi] mov DWORD PTR [rsi], eax ret
a = 100; a = a + 10; b = b + 10; // Without optimization mov DWORD PTR [rdi], 100 add DWORD PTR [rdi], 10 add esi, 10 mov DWORD PTR [rdi], esi
// With optimization mov DWORD PTR [rdi], 110 add DWORD PTR [rsi], 10
if (b % 2 == 0) { a = a + 20; } and esi, 1 jne .L1 add DWORD PTR [rdi], 20 .L1: ret
// Change division by a variable // to division by constants switch (b) { case 2: return a / 2; case 3: return a / 3; [[likely]] case 7: return a / 7; default: return a / b; } mov esi, DWORD PTR [rsi] mov ecx, DWORD PTR [rdi] cmp esi, 7 jne .L8 // [[likely]] reorder branch. This is a / 7 // Division by a constant movsx rax, ecx imul rax, rax, -1840700269 shr rax, 32 add eax, ecx sar ecx, 31 sar eax, 2 sub eax, ecx ret .L8: jg .L3 cmp esi, 2 je .L4 cmp esi, 3 jne .L3 movsx rdx, ecx sar ecx, 31 imul rdx, rdx, 1431655766 shr rdx, 32 mov eax, edx sub eax, ecx ret .L3: mov eax, ecx cdq idiv esi ret .L4: mov edx, ecx shr edx, 31 lea eax, [rdx+rcx] sar eax ret
switch (b) { case 1: return a + 1; case 2: return a / 2; case 3: return (a + 5) / 7; case 4: return (a + 9) / 10; case 5: return (a - 7) / 8; default: return a; } cmp esi, 5 ja .L9 mov esi, esi jmp [QWORD PTR .L4[0+rsi*8]] .L4: // Jump Table .quad .L9 .quad .L8 .quad .L7 .quad .L6 .quad .L5 .quad .L3 .L3: mov eax, edi sub eax, 7 cmovns edi, eax mov eax, edi sar eax, 3 ret .L8: lea eax, [rdi+1] ret .L7: mov eax, edi shr eax, 31 add edi, eax mov eax, edi sar eax ret .L6: lea edx, [rdi+5] movsx rax, edx imul rax, rax, -1840700269 shr rax, 32 add eax, edx sar edx, 31 sar eax, 2 sub eax, edx ret .L5: lea edx, [rdi+9] movsx rax, edx sar edx, 31 imul rax, rax, 1717986919 sar rax, 34 sub eax, edx ret .L9: mov eax, edi ret
// Normal loop, without optimization for (auto elem : arr) { sum += elem; } mov rax, QWORD PTR [rdi] mov rcx, QWORD PTR [rdi+8] xor edx, edx cmp rcx, rax je .L1 .L3: add edx, DWORD PTR [rax] add rax, 4 cmp rax, rcx jne .L3 .L1: mov eax, edx ret
// With loop unrolling __attribute__((optimize("unroll-loops")))
... for (auto elem : arr) { sum += elem; }
mov rdx, QWORD PTR [rdi] mov rsi, QWORD PTR [rdi+8] xor eax, eax cmp rsi, rdx je .L4 mov rcx, rsi sub rcx, rdx sub rcx, 4 shr rcx, 2 add rcx, 1 and ecx, 7 je .L3 cmp rcx, 1 je .L27 cmp rcx, 2 je .L28 cmp rcx, 3 je .L29 cmp rcx, 4 je .L30 cmp rcx, 5 je .L31 cmp rcx, 6 jne .L42 .L32: add eax, DWORD PTR [rdx] add rdx, 4 .L31: add eax, DWORD PTR [rdx] add rdx, 4 .L30: add eax, DWORD PTR [rdx] add rdx, 4 .L29: add eax, DWORD PTR [rdx] add rdx, 4 .L28: add eax, DWORD PTR [rdx] add rdx, 4 .L27: add eax, DWORD PTR [rdx] add rdx, 4 cmp rdx, rsi je .L43 .L3: add eax, DWORD PTR [rdx] add rdx, 32 add eax, DWORD PTR [rdx-28] add eax, DWORD PTR [rdx-24] add eax, DWORD PTR [rdx-20] add eax, DWORD PTR [rdx-16] add eax, DWORD PTR [rdx-12] add eax, DWORD PTR [rdx-8] add eax, DWORD PTR [rdx-4] cmp rdx, rsi jne .L3 ret .L42: mov eax, DWORD PTR [rdx] add rdx, 4 jmp .L32 .L43: ret .L4: ret
// With vectorized instructions for (auto elem : arr) { sum += elem; } mov rdx, QWORD PTR [rdi] mov rdi, QWORD PTR [rdi+8] cmp rdi, rdx je .L7 lea rcx, [rdi-4] mov rax, rdx sub rcx, rdx mov rsi, rcx shr rsi, 2 add rsi, 1 cmp rcx, 8 jbe .L8 mov rcx, rsi pxor xmm0, xmm0 shr rcx, 2 sal rcx, 4 add rcx, rdx .L4: movdqu xmm2, XMMWORD PTR [rax] add rax, 16 paddd xmm0, xmm2 cmp rcx, rax jne .L4 movdqa xmm1, xmm0 psrldq xmm1, 8 paddd xmm0, xmm1 movdqa xmm1, xmm0 psrldq xmm1, 4 paddd xmm0, xmm1 movd eax, xmm0 test sil, 3 je .L1 and rsi, -4 lea rdx, [rdx+rsi*4] .L3: lea rcx, [rdx+4] add eax, DWORD PTR [rdx] cmp rdi, rcx je .L1 lea rcx, [rdx+8] add eax, DWORD PTR [rdx+4] cmp rdi, rcx je .L1 add eax, DWORD PTR [rdx+8] ret .L7: xor eax, eax .L1: ret .L8: xor eax, eax jmp .L3
Function
KxVector<KxLogObserver*, unsigned int>::find(KxLogObserver* const&) const std::allocator<wchar_t>::allocator() // Name Mangling _ZNK8KxVectorIP13KxLogObserverjE4findERKS1_ _ZNSaIwEC1Ev
return foo(a) + 5; // x86 calling conventions sub rsp, 8 call foo(int) add rsp, 8 add eax, 5 ret
// More complex function call struct B { int a; int b; };
int foo(int, B, std::string);
... x = foo(3, B{.a=1, .b=2}, "abcdef");
push rbp mov eax, 26213 mov edi, 3 movabs rsi, 8589934593 push rbx sub rsp, 40 lea rbp, [rsp+16] mov rdx, rsp mov WORD PTR [rsp+20], ax mov QWORD PTR [rsp], rbp mov DWORD PTR [rsp+16], 1684234849 mov QWORD PTR [rsp+8], 6 mov BYTE PTR [rsp+22], 0 call foo(...) mov rdi, QWORD PTR [rsp] mov ebx, eax cmp rdi, rbp je .L2 mov rax, QWORD PTR [rsp+16] lea rsi, [rax+1] call operator delete(void*, unsigned long) .L2: add rsp, 40 lea eax, [rbx+1] pop rbx pop rbp ret mov rbx, rax jmp .L3
void foo(std::unique_ptr<int> &p) { *p = 2; }
void bar(int *p) { *p = 2; }
// ABI requirements on non-trivial types foo(std::unique_ptr<int, std::default_delete<int> >&): mov rax, QWORD PTR [rdi] mov DWORD PTR [rax], 2 ret bar(int*): mov DWORD PTR [rdi], 2 ret
// Virtual function call struct Foo { virtual int foo() = 0; };
int GetFoo(Foo *foo) { return foo->foo() + 5; }
sub rsp, 8 mov rax, QWORD PTR [rdi] call [QWORD PTR [rax]] add rsp, 8 add eax, 5 ret
// syscall via glibc #include <fcntl.h>
auto fp = ::open(path.c_str(), O_APPEND);
sub rsp, 8 mov rdi, QWORD PTR [rdi] mov esi, 1024 xor eax, eax call open add rsp, 8
puts("abc"); // Compile with -fPIE lea rdi, .LC0[rip] // Procedure Linkage Table jmp puts@PLT
Multi-threading
thread_local int x; ... return x; // fs register mov eax, DWORD PTR fs:x@tpoff ret
std::lock_guard _(mutex); mov eax, OFFSET FLAT:_ZL28__gthrw___pthread_key_createPjPFvPvE push rbx mov rbx, rdi test rax, rax je .L2 call __gthrw_pthread_mutex_lock(pthread_mutex_t*) test eax, eax je .L2 mov edi, eax call std::__throw_system_error(int)
std::atomic<int> x; ... x.store(1); mov eax, 1 xchg eax, DWORD PTR [rdi]
x = 1; // Memory barrier std::atomic_thread_fence( std::memory_order_seq_cst); x = 3; mov DWORD PTR [rdi], 1 lock or QWORD PTR [rsp], 0 mov DWORD PTR [rdi], 3
Exception
try { if (x == 3) throw std::runtime_error("Test"); else return 4; } catch (...) { return 1; } .LC0: .string "Test" bar(int): cmp edi, 3 je .L8 mov eax, 4 ret
// Hot/Cold Splitting bar(int) [clone .cold]: .L8: push rbp mov edi, 16 push rbx push rcx call __cxa_allocate_exception mov esi, OFFSET FLAT:.LC0 mov rdi, rax mov rbp, rax call std::runtime_error::runtime_error(char const*) [complete object constructor] mov edx, OFFSET FLAT:_ZNSt13runtime_errorD1Ev mov esi, OFFSET FLAT:_ZTISt13runtime_error mov rdi, rbp call __cxa_throw mov rbx, rax mov rdi, rbp call __cxa_free_exception mov rdi, rbx .L4: call __cxa_begin_catch call __cxa_end_catch pop rdx mov eax, 1 pop rbx pop rbp ret mov rdi, rax jmp .L4
auto* foo(int x) { return new int[x]; }
// noexcept auto* bar(int x) noexcept { return new int[x]; }
foo(int): movabs rax, 2305843009213693950 movsx rdi, edi cmp rax, rdi jb .L2 sal rdi, 2 jmp operator new[](unsigned long) foo(int) [clone .cold]: .L2: push rax call __cxa_throw_bad_array_new_length bar(int): movsx rdi, edi sub rsp, 8 movabs rax, 2305843009213693950 cmp rax, rdi jb .L7 sal rdi, 2 call operator new[](unsigned long) add rsp, 8 ret bar(int) [clone .cold]: .L7: call __cxa_throw_bad_array_new_length
Misc.
for (int i = 0; i < len; ++i) { src[i] = 42; } // Stack Smashing Protector sub rsp, 24 mov rax, QWORD PTR fs:40 mov QWORD PTR [rsp+8], rax xor eax, eax test esi, esi jle .L1 mov rax, QWORD PTR [rsp+8] sub rax, QWORD PTR fs:40 jne .L6 mov edx, esi add rsp, 24 mov esi, 42 jmp memset .L1: mov rax, QWORD PTR [rsp+8] sub rax, QWORD PTR fs:40 jne .L6 add rsp, 24 ret .L6: call __stack_chk_fail
long long sum = 0; for (int i = 0; i < n; ++i) { sum += i; }
return sum;
// No more loop :o test edi, edi jle .LBB0_1 lea eax, [rdi - 1] lea ecx, [rdi - 2] imul rcx, rax shr rcx lea eax, [rdi + rcx] dec eax ret .LBB0_1: xor eax, eax ret
int bar(int x) { assert(x == 1); } .LC1: .string "int bar(int)" .LC2: .string "/app/example.cpp" .LC3: .string "x == 1" bar(int): push rbp mov rbp, rsp sub rsp, 16 mov DWORD PTR [rbp-4], edi cmp DWORD PTR [rbp-4], 1 je .L3 mov ecx, OFFSET FLAT:.LC1 mov edx, 16 mov esi, OFFSET FLAT:.LC2 mov edi, OFFSET FLAT:.LC3 call __assert_fail .L3: // ud2 generated under undefined behavior ud2
int foo(int a) { while (a) {} return 1; } foo(int): mov eax, 1 ret