Why is GCC exchanging rax and xmm0 registers?

Question

I was verifying some assembly generated by gcc version 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2) and realized that the following instructions were being generated:

movq    %xmm0, %rax
movq    %rax, %xmm0

I'd like to know the purpose of these instructions, since the pair seems to have no net effect. Is it some kind of optimization, like when we write:

xor ax, ax

I'd like to make clear that these instructions appeared only when I used the option -mtune=native; my CPU is an Intel Core i5-4200U.

Following is my source code:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"

/*
 * Element-wise product of two arrays.
 *
 * For each index i in [0, array_size) stores array1[i] * array2[i] into
 * array3[i].  The operands are unsigned int, so a product that does not
 * fit wraps around (modular arithmetic).
 *
 * array1, array2  inputs, only read
 * array3          output, only written
 * array_size      number of elements processed in each array
 */
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
    /* This initializer is redundant with the for-loop's own "i = 0"; the
       unoptimized listings in this post store 0 to the same stack slot
       twice as a result. */
    unsigned int i = 0;

    for (i = 0; i < array_size; i++)
    {
        array3[i] = array1[i] * array2[i];
    }
}

/*
 * Benchmark driver.
 *
 * Allocates three arrays of 1024*1024 unsigned ints, fills the first two
 * with rand() values, runs multiply() twice over them and prints the
 * processor time consumed by those two calls, in seconds.
 */
int main()
{   
    const unsigned int array_size = 1024*1024;

    /* NOTE(review): none of the three malloc results is checked for NULL,
       and none of them is passed to free() before main ends. */
    unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
    unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
    unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);

    /* Signed index compared against the unsigned array_size below, so the
       comparison is carried out in unsigned arithmetic. */
    int i = 0;

    /* Seed the generator from the current time: inputs differ per run. */
    srand(time(NULL));

    /* array3 is left unfilled here; multiply() writes every element of it. */
    for (i = 0; i < array_size; i++)
    {
        array1[i] = rand();
        array2[i] = rand();
    }

    /* Only the two multiply() calls sit between the clock() samples. */
    clock_t t0 = clock();

    multiply(array1,array2,array3, array_size);
    multiply(array1,array2,array3, array_size);

    clock_t t1 = clock();

    /* The tick difference is converted to double and divided by
       CLOCKS_PER_SEC, so the value passed to printf is a double. */
    printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);

    /* No explicit return statement; the listings in this post load 0 into
       %eax before main's ret. */
}

This is the assembly generated by GCC using `gcc -S -mtune=native Main.c`:

.file   "Main.c"
.text
# ---------------------------------------------------------------------
# void multiply(const unsigned int *array1, const unsigned int *array2,
#               unsigned int *array3, const unsigned int array_size)
# AT&T syntax, unoptimized compiler output.
# In:    %rdi = array1, %rsi = array2, %rdx = array3, %ecx = array_size
# Out:   none (results are stored through array3)
# Frame: -24(%rbp) array1   -32(%rbp) array2   -40(%rbp) array3
#        -44(%rbp) array_size                  -4(%rbp)  loop index i
#        %rsp is never lowered in this function, so all of these slots
#        lie below the stack pointer (System V red zone, leaf function).
# Every use of a variable reloads it from its stack slot.
# ---------------------------------------------------------------------
.globl  multiply
.type   multiply, @function
multiply:
.LFB2:
    .cfi_startproc
    pushq   %rbp                        # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp                  # establish this function's frame
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)             # spill array1
    movq    %rsi, -32(%rbp)             # spill array2
    movq    %rdx, -40(%rbp)             # spill array3
    movl    %ecx, -44(%rbp)             # spill array_size
    movl    $0, -4(%rbp)                # i = 0 (declaration initializer)
    movl    $0, -4(%rbp)                # i = 0 again (for-loop initializer)
    jmp .L2                             # test the condition before the first iteration
.L3:                                    # loop body
    movl    -4(%rbp), %eax              # eax = i (zero-extended into rax)
    leaq    0(,%rax,4), %rdx            # rdx = i * 4, byte offset of element i
    movq    -40(%rbp), %rax             # rax = array3
    addq    %rax, %rdx                  # rdx = &array3[i]
    movl    -4(%rbp), %eax              # eax = i
    leaq    0(,%rax,4), %rcx            # rcx = i * 4
    movq    -24(%rbp), %rax             # rax = array1
    addq    %rcx, %rax                  # rax = &array1[i]
    movl    (%rax), %ecx                # ecx = array1[i]
    movl    -4(%rbp), %eax              # eax = i
    leaq    0(,%rax,4), %rsi            # rsi = i * 4
    movq    -32(%rbp), %rax             # rax = array2
    addq    %rsi, %rax                  # rax = &array2[i]
    movl    (%rax), %eax                # eax = array2[i]
    imull   %ecx, %eax                  # eax = array1[i] * array2[i], low 32 bits kept
    movl    %eax, (%rdx)                # array3[i] = product
    addl    $1, -4(%rbp)                # i++
.L2:                                    # loop condition
    movl    -4(%rbp), %eax              # eax = i
    cmpl    -44(%rbp), %eax             # compare i with array_size
    jb  .L3                             # unsigned "below": keep looping while i < array_size
    nop
    popq    %rbp                        # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE2:
    .size   multiply, .-multiply
    # -----------------------------------------------------------------
    # int main(void): benchmark driver, AT&T syntax, unoptimized output
    # built with -mtune=native.
    # Frame: -60(%rbp) array_size   -64(%rbp) i
    #        -56(%rbp) array1       -48(%rbp) array2    -40(%rbp) array3
    #        -32(%rbp) t0           -24(%rbp) t1
    # %rbx (saved in the prologue) holds an element address across each
    # call to rand.
    # -----------------------------------------------------------------
    .section    .rodata
.LC1:
    .string "\nTempo: %f\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB3:
    .cfi_startproc
    pushq   %rbp                        # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp                  # establish this function's frame
    .cfi_def_cfa_register 6
    pushq   %rbx                        # rbx is used below, so preserve it
    subq    $56, %rsp                   # reserve space for the locals
    .cfi_offset 3, -24
    movl    $1048576, -60(%rbp)         # array_size = 1024*1024
    movl    -60(%rbp), %eax             # eax = array_size (zero-extended into rax)
    salq    $2, %rax                    # rax = array_size * 4 bytes
    movq    %rax, %rdi                  # first argument: allocation size
    call    malloc
    movq    %rax, -56(%rbp)             # array1 = returned pointer
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -48(%rbp)             # array2 = returned pointer
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -40(%rbp)             # array3 = returned pointer
    movl    $0, -64(%rbp)               # i = 0 (declaration initializer)
    movl    $0, %edi                    # argument NULL
    call    time
    movl    %eax, %edi                  # low 32 bits of time's result become the seed
    call    srand
    movl    $0, -64(%rbp)               # i = 0 (for-loop initializer)
    jmp .L5                             # test the condition before the first iteration
.L6:                                    # fill-loop body
    movl    -64(%rbp), %eax             # eax = i
    cltq                                # sign-extend eax into rax (i is a signed int)
    leaq    0(,%rax,4), %rdx            # rdx = i * 4
    movq    -56(%rbp), %rax             # rax = array1
    leaq    (%rdx,%rax), %rbx           # rbx = &array1[i], kept across the call
    call    rand
    movl    %eax, (%rbx)                # array1[i] = rand()
    movl    -64(%rbp), %eax             # eax = i
    cltq
    leaq    0(,%rax,4), %rdx            # rdx = i * 4
    movq    -48(%rbp), %rax             # rax = array2
    leaq    (%rdx,%rax), %rbx           # rbx = &array2[i], kept across the call
    call    rand
    movl    %eax, (%rbx)                # array2[i] = rand()
    addl    $1, -64(%rbp)               # i++
.L5:                                    # fill-loop condition
    movl    -64(%rbp), %eax             # eax = i
    cmpl    -60(%rbp), %eax             # compare i with array_size
    jb  .L6                             # unsigned "below": loop while i < array_size
    call    clock
    movq    %rax, -32(%rbp)             # t0 = clock()
    movl    -60(%rbp), %ecx             # 4th argument: array_size
    movq    -40(%rbp), %rdx             # 3rd argument: array3
    movq    -48(%rbp), %rsi             # 2nd argument: array2
    movq    -56(%rbp), %rax
    movq    %rax, %rdi                  # 1st argument: array1
    call    multiply
    movl    -60(%rbp), %ecx             # same four arguments for the second call
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    call    clock
    movq    %rax, -24(%rbp)             # t1 = clock()
    movq    -24(%rbp), %rax
    subq    -32(%rbp), %rax             # rax = t1 - t0
    pxor    %xmm0, %xmm0                # clear xmm0 before it is used as the conversion target
    cvtsi2sdq   %rax, %xmm0             # xmm0 = (double)(t1 - t0)
    movsd   .LC0(%rip), %xmm1           # xmm1 = the double stored at .LC0 (divisor)
    divsd   %xmm1, %xmm0                # xmm0 = elapsed ticks / divisor
    movq    %xmm0, %rax                 # copy the double's bits into rax ...
    movq    %rax, %xmm0                 # ... and straight back: xmm0 keeps the same value.
                                        # This pair is what the question asks about; the
                                        # comment thread attributes it to unoptimized codegen.
    movl    $.LC1, %edi                 # 1st argument: format string
    movl    $1, %eax                    # variadic call: al = 1 vector register (xmm0) carries an argument
    call    printf
    movl    $0, %eax                    # return value 0
    addq    $56, %rsp                   # release the locals
    popq    %rbx                        # restore callee-saved rbx
    popq    %rbp                        # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE3:
    .size   main, .-main
    # Read-only 8-byte constant that main loads with movsd and uses as
    # the divisor in divsd.  Stored as two 32-bit halves, low half first.
    .section    .rodata
    .align 8
.LC0:
    .long   0                           # low 32 bits
    .long   1093567616                  # high 32 bits (0x412E8480); with the low half: the double 1000000.0
    .ident  "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
    .section    .note.GNU-stack,"",@progbits

And this is the assembly generated with `gcc -S Main.c`:

.file   "Main.c"
.text
# ---------------------------------------------------------------------
# void multiply(const unsigned int *array1, const unsigned int *array2,
#               unsigned int *array3, const unsigned int array_size)
# AT&T syntax, unoptimized compiler output (no -mtune option).
# In:    %rdi = array1, %rsi = array2, %rdx = array3, %ecx = array_size
# Out:   none (results are stored through array3)
# Frame: -24(%rbp) array1   -32(%rbp) array2   -40(%rbp) array3
#        -44(%rbp) array_size                  -4(%rbp)  loop index i
#        %rsp is never lowered in this function, so all of these slots
#        lie below the stack pointer (System V red zone, leaf function).
# ---------------------------------------------------------------------
.globl  multiply
.type   multiply, @function
multiply:
.LFB2:
    .cfi_startproc
    pushq   %rbp                        # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp                  # establish this function's frame
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)             # spill array1
    movq    %rsi, -32(%rbp)             # spill array2
    movq    %rdx, -40(%rbp)             # spill array3
    movl    %ecx, -44(%rbp)             # spill array_size
    movl    $0, -4(%rbp)                # i = 0 (declaration initializer)
    movl    $0, -4(%rbp)                # i = 0 again (for-loop initializer)
    jmp .L2                             # jump to the loop condition first
.L3:                                    # loop body
    movl    -4(%rbp), %eax              # eax = i (zero-extended into rax)
    leaq    0(,%rax,4), %rdx            # rdx = i * 4, byte offset of element i
    movq    -40(%rbp), %rax             # rax = array3
    addq    %rax, %rdx                  # rdx = &array3[i]
    movl    -4(%rbp), %eax              # reload i
    leaq    0(,%rax,4), %rcx            # rcx = i * 4
    movq    -24(%rbp), %rax             # rax = array1
    addq    %rcx, %rax                  # rax = &array1[i]
    movl    (%rax), %ecx                # ecx = array1[i]
    movl    -4(%rbp), %eax              # reload i
    leaq    0(,%rax,4), %rsi            # rsi = i * 4
    movq    -32(%rbp), %rax             # rax = array2
    addq    %rsi, %rax                  # rax = &array2[i]
    movl    (%rax), %eax                # eax = array2[i]
    imull   %ecx, %eax                  # eax = array1[i] * array2[i], low 32 bits kept
    movl    %eax, (%rdx)                # array3[i] = product
    addl    $1, -4(%rbp)                # i++
.L2:                                    # loop condition
    movl    -4(%rbp), %eax              # eax = i
    cmpl    -44(%rbp), %eax             # compare i with array_size
    jb  .L3                             # unsigned "below": keep looping while i < array_size
    nop
    popq    %rbp                        # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE2:
    .size   multiply, .-multiply
    # -----------------------------------------------------------------
    # int main(void): benchmark driver, AT&T syntax, unoptimized output
    # built without any -mtune option.
    # Frame: -60(%rbp) array_size   -64(%rbp) i
    #        -56(%rbp) array1       -48(%rbp) array2    -40(%rbp) array3
    #        -32(%rbp) t0           -24(%rbp) t1
    # %rbx (saved in the prologue) holds an element address across each
    # call to rand.
    # -----------------------------------------------------------------
    .section    .rodata
.LC1:
    .string "\nTempo: %f\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB3:
    .cfi_startproc
    pushq   %rbp                        # save caller's frame pointer
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp                  # establish this function's frame
    .cfi_def_cfa_register 6
    pushq   %rbx                        # rbx is used below, so preserve it
    subq    $56, %rsp                   # reserve space for the locals
    .cfi_offset 3, -24
    movl    $1048576, -60(%rbp)         # array_size = 1024*1024
    movl    -60(%rbp), %eax             # eax = array_size (zero-extended into rax)
    salq    $2, %rax                    # rax = array_size * 4 bytes
    movq    %rax, %rdi                  # first argument: allocation size
    call    malloc
    movq    %rax, -56(%rbp)             # array1 = returned pointer
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -48(%rbp)             # array2 = returned pointer
    movl    -60(%rbp), %eax
    salq    $2, %rax
    movq    %rax, %rdi
    call    malloc
    movq    %rax, -40(%rbp)             # array3 = returned pointer
    movl    $0, -64(%rbp)               # i = 0 (declaration initializer)
    movl    $0, %edi                    # argument NULL
    call    time
    movl    %eax, %edi                  # low 32 bits of time's result become the seed
    call    srand
    movl    $0, -64(%rbp)               # i = 0 (for-loop initializer)
    jmp .L5                             # jump to the loop condition first
.L6:                                    # fill-loop body
    movl    -64(%rbp), %eax             # eax = i
    cltq                                # sign-extend eax into rax (i is a signed int)
    leaq    0(,%rax,4), %rdx            # rdx = i * 4
    movq    -56(%rbp), %rax             # rax = array1
    leaq    (%rdx,%rax), %rbx           # rbx = &array1[i], kept across the call
    call    rand
    movl    %eax, (%rbx)                # array1[i] = rand()
    movl    -64(%rbp), %eax             # eax = i
    cltq
    leaq    0(,%rax,4), %rdx            # rdx = i * 4
    movq    -48(%rbp), %rax             # rax = array2
    leaq    (%rdx,%rax), %rbx           # rbx = &array2[i], kept across the call
    call    rand
    movl    %eax, (%rbx)                # array2[i] = rand()
    addl    $1, -64(%rbp)               # i++
.L5:                                    # fill-loop condition
    movl    -64(%rbp), %eax             # eax = i
    cmpl    -60(%rbp), %eax             # compare i with array_size
    jb  .L6                             # unsigned "below": loop while i < array_size
    call    clock
    movq    %rax, -32(%rbp)             # t0 = clock()
    movl    -60(%rbp), %ecx             # 4th argument: array_size
    movq    -40(%rbp), %rdx             # 3rd argument: array3
    movq    -48(%rbp), %rsi             # 2nd argument: array2
    movq    -56(%rbp), %rax
    movq    %rax, %rdi                  # 1st argument: array1
    call    multiply
    movl    -60(%rbp), %ecx             # same four arguments for the second call
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    multiply
    call    clock
    movq    %rax, -24(%rbp)             # t1 = clock()
    movq    -24(%rbp), %rax
    subq    -32(%rbp), %rax             # rax = t1 - t0
    pxor    %xmm0, %xmm0                # clear xmm0 before it is used as the conversion target
    cvtsi2sdq   %rax, %xmm0             # xmm0 = (double)(t1 - t0)
    movsd   .LC0(%rip), %xmm1           # xmm1 = the double stored at .LC0 (divisor)
    divsd   %xmm1, %xmm0                # xmm0 = elapsed ticks / divisor; stays in xmm0 for printf
    movl    $.LC1, %edi                 # 1st argument: format string
    movl    $1, %eax                    # variadic call: al = 1 vector register (xmm0) carries an argument
    call    printf
    movl    $0, %eax                    # return value 0
    addq    $56, %rsp                   # release the locals
    popq    %rbx                        # restore callee-saved rbx
    popq    %rbp                        # restore caller's frame pointer
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE3:
    .size   main, .-main
    # Read-only 8-byte constant that main loads with movsd and uses as
    # the divisor in divsd.  Stored as two 32-bit halves, low half first.
    .section    .rodata
    .align 8
.LC0:
    .long   0                           # low 32 bits
    .long   1093567616                  # high 32 bits (0x412E8480); with the low half: the double 1000000.0
    .ident  "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
    .section    .note.GNU-stack,"",@progbits

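The only difference is in the code after the .L5 label: the first listing has the two movq instructions between divsd and the argument setup for the call to printf, and the second does not.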

You forgot to enable optimizations. Otherwise all bets are off as to why the compiler generates silly code. — Jester, Nov 16 '15 at 21:40
Thank you for your fast answer @Jester, I will try with optimization enabled. — Bruno Simas Hadlich, Nov 16 '15 at 21:43
Instructions seem legit, because of the different sizes of the involved registers... You should look at the immediately preceding or following instructions in order to understand better why they are generated (and why not)... — Macmade, Nov 16 '15 at 21:44
@Macmade, I'm sure that the instructions around may be affected by those movq, but the fact is that the only difference was in the addition of those movq, nothing more, anyway I did what Jester told me and added -O3, this way those instructions disappeared. — Bruno Simas Hadlich, Nov 16 '15 at 22:00
Although doing what @Jester told me removed the extra instructions, the question itself was not answered: I want to know why the extra instructions are generated with -mtune=native and not without it. In addition, I believe this is not "a problem that can no longer be reproduced", considering that I provided GCC's version, CPU model, source code and compilation options. — Bruno Simas Hadlich, Nov 17 '15 at 16:34
I told you, without optimizations, all bets are off. It's just a side effect of how the compiler works. It has seen an expression, and decided to use `rax` to store it. That's the end of the previous block. In the next block, it sees it has to pass this expression as argument to a function, and the calling convention says to do so using `xmm0` so it reloaded the value from `rax`. Without optimization, it does not realize these two operations cancel. — Jester, Nov 17 '15 at 16:46
[Don't cast the result of `malloc` in C](http://stackoverflow.com/q/605845/995714) — phuclv, Apr 15 '16 at 02:16

Why is GCC exchanging rax and xmm0 registers?

0 Answers