Pete Cole from Intermorphic: March 2010

From time to time, I need to rip-apart functions to do some old-school performance tuning. This is really easy to do.

Here is an example file, called perf.c ... in it, I have two versions of a function. One is inefficient, the other is more efficient by virtue of being written slightly differently.



typedef unsigned char uint8_t;

typedef unsigned short uint16_t;

typedef unsigned int uint32_t;





// Inefficient inner loop



void byteswap (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)

{

    long i;

    long lWords = PBytes >> 1;



    for (i=0; i < lWords; i++)

    {

        unsigned lWord = ((const uint16_t*)PpFrom)[i];

        ((uint16_t*)PpDst)[i] = ((lWord & 0x0ff) << 8) | lWord >> 8;

    }

}



// Efficient inner loop



void byteswap_2 (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)

{

    uint32_t lWords = PBytes / 2;



    uint16_t* PpDstPtr = (uint16_t *)PpDst;

    uint16_t* PpDstEnd = PpDstPtr + lWords;



    const uint16_t* PpFromPtr = (const uint16_t *)PpFrom;

    for (;PpDstPtr < PpDstEnd;)

    {

        unsigned lWord = (*PpFromPtr++);

        (*PpDstPtr++) = ((lWord & 0x0ff) << 8) | lWord >> 8;

    }

}

From the command-line, run this command (assuming you have gcc!)

gcc -c -g -gstabs perf.c perf.s

This takes your source code (perf.c), and outputs the disassembled output to a file called perf.s ... which you can take a look at with your text editor.

In the file, the code generated for the first function looks like this:



.globl _byteswap

.def    _byteswap;  .scl    2;  .type   32; .endef

_byteswap:

LFB3:

.file 1 "perf.c"

.loc 1 9 0

pushl   %ebp

LCFI0:

movl    %esp, %ebp

LCFI1:

subl    $16, %esp

LCFI2:

.loc 1 11 0

movl    16(%ebp), %eax

shrl    %eax

movl    %eax, -8(%ebp)

.loc 1 13 0

movl    $0, -12(%ebp)

jmp L2

L3:

LBB2:

.loc 1 15 0

movl    8(%ebp), %edx

movl    -12(%ebp), %eax

addl    %eax, %eax

leal    (%edx,%eax), %eax

movzwl  (%eax), %eax

movzwl  %ax, %eax

movl    %eax, -4(%ebp)

.loc 1 16 0

movl    12(%ebp), %edx

movl    -12(%ebp), %eax

addl    %eax, %eax

leal    (%edx,%eax), %ecx

movl    -4(%ebp), %eax

movl    %eax, %edx

sall    $8, %edx

movl    -4(%ebp), %eax

shrl    $8, %eax

orl %edx, %eax

movw    %ax, (%ecx)

LBE2:

.loc 1 13 0

addl    $1, -12(%ebp)

L2:

movl    -12(%ebp), %eax

cmpl    -8(%ebp), %eax

jl  L3

.loc 1 18 0

leave

ret

The code generated for the second function looks like this:



.globl _byteswap_2

.def    _byteswap_2;    .scl    2;  .type   32; .endef

_byteswap_2:

LFB4:

.loc 1 21 0

pushl   %ebp

LCFI3:

movl    %esp, %ebp

LCFI4:

subl    $32, %esp

LCFI5:

.loc 1 22 0

movl    16(%ebp), %eax

shrl    %eax

movl    %eax, -20(%ebp)

.loc 1 24 0

movl    12(%ebp), %eax

movl    %eax, -16(%ebp)

.loc 1 25 0

movl    -20(%ebp), %eax

addl    %eax, %eax

addl    -16(%ebp), %eax

movl    %eax, -12(%ebp)

.loc 1 27 0

movl    8(%ebp), %eax

movl    %eax, -8(%ebp)

jmp L6

L7:

LBB3:

.loc 1 30 0

movl    -8(%ebp), %eax

movzwl  (%eax), %eax

movzwl  %ax, %eax

movl    %eax, -4(%ebp)

addl    $2, -8(%ebp)

.loc 1 31 0

movl    -4(%ebp), %eax

movl    %eax, %edx

sall    $8, %edx

movl    -4(%ebp), %eax

shrl    $8, %eax

orl %eax, %edx

movl    -16(%ebp), %eax

movw    %dx, (%eax)

addl    $2, -16(%ebp)

L6:

LBE3:

.loc 1 28 0

movl    -16(%ebp), %eax

cmpl    -12(%ebp), %eax

jb  L7

.loc 1 33 0

leave

ret

I've highlighted the assembly of the code generated for the two inner loops. Look how much more efficient the second version is, with only a simple bit of re-coding!

It should be easy using this approach to learn how to make your code more efficient, without much effort.

I've done quite a bit of work over the past year with PC-hosted Linux (Ubuntu) VMs under VMWare Player, mainly for Android work. I got so fed up recently with the poor system integration (poor Mouse and Keyboard transitioning when moving to/from host and the VM, no copy/paste between Windows and the VM, limited support for VM desktop resizing, not easy to share file data between host and VM), that I decided to spend some time looking at VirtualBox.

After a day evaluating, I've now moved entirely to VirtualBox! I was even able to (easily) set-up 64-bit VMs running on 32-bit Windows, which was very handy; and it was simple to set-up a shared file area between host and VM.

I haven't yet tried VirtualBox on Mac, but I'm perfectly happy with Parallels 5 at the moment so haven't felt the need to use anything else on Mac...

Pete Cole from Intermorphic

Monday 8 March 2010

Performance-tuning example with gcc

Monday 1 March 2010

VisualBox, VMWare Player and Parallels

Intermorphic

Blog Archive

About Me

Blog Roll

Cool Sites

Followers