Monday, 8 March 2010

Performance-tuning example with gcc

From time to time, I need to rip-apart functions to do some old-school performance tuning. This is really easy to do.

Here is an example file, called perf.c ... in it, I have two versions of a function. One is inefficient, the other is more efficient by virtue of being written slightly differently.


typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;


// Inefficient inner loop

void byteswap (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)
{
    long i;
    long lWords = PBytes >> 1;

    for (i=0; i < lWords; i++)
    {
        unsigned lWord = ((const uint16_t*)PpFrom)[i];
        ((uint16_t*)PpDst)[i] = ((lWord & 0x0ff) << 8) | lWord >> 8;
    }
}

// Efficient inner loop

void byteswap_2 (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)
{
    uint32_t lWords = PBytes / 2;

    uint16_t* PpDstPtr = (uint16_t *)PpDst;
    uint16_t* PpDstEnd = PpDstPtr + lWords;

    const uint16_t* PpFromPtr = (const uint16_t *)PpFrom;
    for (;PpDstPtr < PpDstEnd;)
    {
        unsigned lWord = (*PpFromPtr++);
        (*PpDstPtr++) = ((lWord & 0x0ff) << 8) | lWord >> 8;
    }
}


From the command-line, run this command (assuming you have gcc!)

gcc -c -g -gstabs perf.c perf.s

This takes your source code (perf.c), and outputs the disassembled output to a file called perf.s ... which you can take a look at with your text editor.

In the file, the code generated for the first function looks like this:


.globl _byteswap
.def _byteswap; .scl 2; .type 32; .endef
_byteswap:
LFB3:
.file 1 "perf.c"
.loc 1 9 0
pushl %ebp
LCFI0:
movl %esp, %ebp
LCFI1:
subl $16, %esp
LCFI2:
.loc 1 11 0
movl 16(%ebp), %eax
shrl %eax
movl %eax, -8(%ebp)
.loc 1 13 0
movl $0, -12(%ebp)
jmp L2
L3:
LBB2:
.loc 1 15 0
movl 8(%ebp), %edx
movl -12(%ebp), %eax
addl %eax, %eax
leal (%edx,%eax), %eax
movzwl (%eax), %eax
movzwl %ax, %eax
movl %eax, -4(%ebp)
.loc 1 16 0
movl 12(%ebp), %edx
movl -12(%ebp), %eax
addl %eax, %eax
leal (%edx,%eax), %ecx
movl -4(%ebp), %eax
movl %eax, %edx
sall $8, %edx
movl -4(%ebp), %eax
shrl $8, %eax
orl %edx, %eax
movw %ax, (%ecx)
LBE2:
.loc 1 13 0
addl $1, -12(%ebp)
L2:
movl -12(%ebp), %eax
cmpl -8(%ebp), %eax
jl L3

.loc 1 18 0
leave
ret



The code generated for the second function looks like this:


.globl _byteswap_2
.def _byteswap_2; .scl 2; .type 32; .endef
_byteswap_2:
LFB4:
.loc 1 21 0
pushl %ebp
LCFI3:
movl %esp, %ebp
LCFI4:
subl $32, %esp
LCFI5:
.loc 1 22 0
movl 16(%ebp), %eax
shrl %eax
movl %eax, -20(%ebp)
.loc 1 24 0
movl 12(%ebp), %eax
movl %eax, -16(%ebp)
.loc 1 25 0
movl -20(%ebp), %eax
addl %eax, %eax
addl -16(%ebp), %eax
movl %eax, -12(%ebp)
.loc 1 27 0
movl 8(%ebp), %eax
movl %eax, -8(%ebp)
jmp L6
L7:
LBB3:
.loc 1 30 0
movl -8(%ebp), %eax
movzwl (%eax), %eax
movzwl %ax, %eax
movl %eax, -4(%ebp)
addl $2, -8(%ebp)
.loc 1 31 0
movl -4(%ebp), %eax
movl %eax, %edx
sall $8, %edx
movl -4(%ebp), %eax
shrl $8, %eax
orl %eax, %edx
movl -16(%ebp), %eax
movw %dx, (%eax)
addl $2, -16(%ebp)
L6:
LBE3:
.loc 1 28 0
movl -16(%ebp), %eax
cmpl -12(%ebp), %eax
jb L7

.loc 1 33 0
leave
ret


I've highlighted the assembly of the code generated for the two inner loops. Look how much more efficient the second version is, with only a simple bit of re-coding!

It should be easy using this approach to learn how to make your code more efficient, without much effort.

No comments: