I was inspecting some assembly generated by GCC 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2) and noticed that the following pair of instructions was being generated:
movq %xmm0, %rax
movq %rax, %xmm0
I'd like to know what the purpose of these instructions is, since together they appear to do nothing. Is it some kind of optimization, like when we do:
xor ax, ax
I'd like to make it clear that these instructions appeared only when I used the option -mtune=native; my CPU is an Intel Core i5-4200U.
The following is my source code:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"
/*
 * Element-wise product of two arrays.
 *
 * Stores array1[i] * array2[i] into array3[i] for every index i in
 * [0, array_size). All three arrays must hold at least array_size elements.
 * The multiplication is done in unsigned int, so a product that does not
 * fit wraps around instead of overflowing.
 */
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
/* i is initialized here and again in the for statement below. */
unsigned int i = 0;
for (i = 0; i < array_size; i++)
{
array3[i] = array1[i] * array2[i];
}
}
/*
 * Benchmark driver: allocates three arrays of 1024*1024 unsigned ints,
 * fills the first two with rand() values, calls multiply() twice and
 * prints the elapsed time in seconds as measured with clock().
 */
int main()
{
const unsigned int array_size = 1024*1024;
/* NOTE(review): none of the three malloc() results is checked for NULL,
   and the buffers are never freed before main ends. */
unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
/* NOTE(review): i is a signed int but is compared against the unsigned
   array_size in the loop condition below. */
int i = 0;
/* Seed the pseudo-random generator with the current time. */
srand(time(NULL));
/* Fill both input arrays; array3 is only written by multiply(). */
for (i = 0; i < array_size; i++)
{
array1[i] = rand();
array2[i] = rand();
}
/* Time two back-to-back calls to multiply() on the same data. */
clock_t t0 = clock();
multiply(array1,array2,array3, array_size);
multiply(array1,array2,array3, array_size);
clock_t t1 = clock();
/* Convert the clock() tick difference to seconds before printing. */
printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);
/* NOTE(review): there is no explicit return statement. */
}
This is the assembly generated by GCC using `gcc -S -mtune=native Main.c`:
# Listing produced with -mtune=native and no -O option (x86-64, AT&T syntax).
#
# multiply: array3[i] = array1[i] * array2[i] for i in [0, array_size).
# Incoming arguments: %rdi = array1, %rsi = array2, %rdx = array3,
# %ecx = array_size.
.file "Main.c"
.text
.globl multiply
.type multiply, @function
multiply:
.LFB2:
.cfi_startproc
# Set up the frame pointer. %rsp is not lowered afterwards, so the locals
# below sit under the stack pointer (this function makes no calls).
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Spill the arguments to the frame: array1 at -24, array2 at -32,
# array3 at -40 and array_size at -44 (all relative to %rbp).
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
# i lives at -4(%rbp). It is zeroed twice: once for the initializer in the
# declaration and once for the init expression of the for statement.
movl $0, -4(%rbp)
movl $0, -4(%rbp)
# Evaluate the loop condition before running the body for the first time.
jmp .L2
.L3:
# rdx = &array3[i]  (base pointer + i * 4 bytes)
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
# ecx = array1[i]
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
# eax = array2[i]
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
# array3[i] = array1[i] * array2[i]  (32-bit multiply, result truncated)
imull %ecx, %eax
movl %eax, (%rdx)
# i++
addl $1, -4(%rbp)
.L2:
# Loop while i < array_size; jb makes this an unsigned comparison.
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
# Restore the caller's frame pointer and return.
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
# Format string passed to printf at the end of main.
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, @function
# main (listing built with -mtune=native, no -O option).
# Frame layout relative to %rbp: array_size at -60, i at -64, array1 at -56,
# array2 at -48, array3 at -40, t0 at -32, t1 at -24.
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# %rbx is callee-saved; it is used below to keep an element address alive
# across the calls to rand, so it is saved here and restored before ret.
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
# array_size = 1024*1024
movl $1048576, -60(%rbp)
# array1 = malloc(array_size * 4); the shift by 2 is the multiply by
# sizeof(unsigned int).
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
# array2 = malloc(array_size * 4)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
# array3 = malloc(array_size * 4)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
# i = 0
movl $0, -64(%rbp)
# srand(time(NULL)): the NULL argument is the zero loaded into %edi.
movl $0, %edi
call time
movl %eax, %edi
call srand
# Fill loop: i = 0, then jump to the condition at .L5.
movl $0, -64(%rbp)
jmp .L5
.L6:
# rbx = &array1[i]; cltq sign-extends i because it is a signed int here.
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
# array1[i] = rand()
call rand
movl %eax, (%rbx)
# rbx = &array2[i]
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
# array2[i] = rand()
call rand
movl %eax, (%rbx)
# i++
addl $1, -64(%rbp)
.L5:
# Loop while i < array_size; jb compares the two values as unsigned.
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
# t0 = clock()
call clock
movq %rax, -32(%rbp)
# First call: multiply(array1, array2, array3, array_size)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
# Second call with the same arguments.
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
# t1 = clock()
call clock
movq %rax, -24(%rbp)
# rax = t1 - t0
movq -24(%rbp), %rax
subq -32(%rbp), %rax
# Zero %xmm0, then convert the 64-bit tick difference to a double in it.
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
# Divide by the double constant stored at .LC0 (the CLOCKS_PER_SEC divisor).
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
# The double is copied from %xmm0 to %rax and straight back again, which
# leaves the value in %xmm0 unchanged. This pair is the only instruction
# difference from the listing built without -mtune=native.
movq %xmm0, %rax
movq %rax, %xmm0
# printf(format, seconds): %edi = format string, %xmm0 = the double.
# %eax = 1 tells the variadic callee that one vector register holds an
# argument.
movl $.LC1, %edi
movl $1, %eax
call printf
# Return value of main is 0.
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
# Two 32-bit words, low word first, that together encode the double
# constant 1000000.0 loaded by the movsd above.
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",@progbits
And this is the assembly generated with `gcc -S Main.c`:
# Listing produced with default tuning and no -O option (x86-64, AT&T syntax).
#
# multiply: array3[i] = array1[i] * array2[i] for i in [0, array_size).
# Incoming arguments: %rdi = array1, %rsi = array2, %rdx = array3,
# %ecx = array_size.
.file "Main.c"
.text
.globl multiply
.type multiply, @function
multiply:
.LFB2:
.cfi_startproc
# Set up the frame pointer. %rsp is not lowered afterwards, so the locals
# below sit under the stack pointer (this function makes no calls).
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Spill the arguments to the frame: array1 at -24, array2 at -32,
# array3 at -40 and array_size at -44 (all relative to %rbp).
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
# i lives at -4(%rbp). It is zeroed twice: once for the initializer in the
# declaration and once for the init expression of the for statement.
movl $0, -4(%rbp)
movl $0, -4(%rbp)
# Evaluate the loop condition before running the body for the first time.
jmp .L2
.L3:
# rdx = &array3[i]  (base pointer + i * 4 bytes)
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
# ecx = array1[i]
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
# eax = array2[i]
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
# array3[i] = array1[i] * array2[i]  (32-bit multiply, result truncated)
imull %ecx, %eax
movl %eax, (%rdx)
# i++
addl $1, -4(%rbp)
.L2:
# Loop while i < array_size; jb makes this an unsigned comparison.
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
# Restore the caller's frame pointer and return.
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
# Format string passed to printf at the end of main.
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, @function
# main (listing built with default tuning, no -O option).
# Frame layout relative to %rbp: array_size at -60, i at -64, array1 at -56,
# array2 at -48, array3 at -40, t0 at -32, t1 at -24.
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# %rbx is callee-saved; it is used below to keep an element address alive
# across the calls to rand, so it is saved here and restored before ret.
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
# array_size = 1024*1024
movl $1048576, -60(%rbp)
# array1 = malloc(array_size * 4); the shift by 2 is the multiply by
# sizeof(unsigned int).
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
# array2 = malloc(array_size * 4)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
# array3 = malloc(array_size * 4)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
# i = 0
movl $0, -64(%rbp)
# srand(time(NULL)): the NULL argument is the zero loaded into %edi.
movl $0, %edi
call time
movl %eax, %edi
call srand
# Fill loop: i = 0, then jump to the condition at .L5.
movl $0, -64(%rbp)
jmp .L5
.L6:
# rbx = &array1[i]; cltq sign-extends i because it is a signed int here.
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
# array1[i] = rand()
call rand
movl %eax, (%rbx)
# rbx = &array2[i]
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
# array2[i] = rand()
call rand
movl %eax, (%rbx)
# i++
addl $1, -64(%rbp)
.L5:
# Loop while i < array_size; jb compares the two values as unsigned.
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
# t0 = clock()
call clock
movq %rax, -32(%rbp)
# First call: multiply(array1, array2, array3, array_size)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
# Second call with the same arguments.
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
# t1 = clock()
call clock
movq %rax, -24(%rbp)
# rax = t1 - t0
movq -24(%rbp), %rax
subq -32(%rbp), %rax
# Zero %xmm0, then convert the 64-bit tick difference to a double in it.
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
# Divide by the double constant stored at .LC0 (the CLOCKS_PER_SEC divisor).
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
# The quotient stays in %xmm0 and goes to printf directly; the
# -mtune=native listing has an extra %xmm0 -> %rax -> %xmm0 copy pair at
# this point.
# printf(format, seconds): %edi = format string, %xmm0 = the double.
# %eax = 1 tells the variadic callee that one vector register holds an
# argument.
movl $.LC1, %edi
movl $1, %eax
call printf
# Return value of main is 0.
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
# Two 32-bit words, low word first, that together encode the double
# constant 1000000.0 loaded by the movsd above.
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",@progbits
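The only difference is in main, after the loop at the .L5 label and just before the call to printf: the -mtune=native listing contains the extra movq %xmm0, %rax / movq %rax, %xmm0 pair right after divsd.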