Here is an example file, called perf.c ... in it, I have two versions of a function. One is inefficient, the other is more efficient by virtue of being written slightly differently.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Inefficient inner loop
void byteswap (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)
{
long i;
long lWords = PBytes >> 1;
for (i=0; i < lWords; i++)
{
unsigned lWord = ((const uint16_t*)PpFrom)[i];
((uint16_t*)PpDst)[i] = ((lWord & 0x0ff) << 8) | lWord >> 8;
}
}
// Efficient inner loop
void byteswap_2 (const uint8_t *PpFrom, uint8_t *PpDst, uint32_t PBytes)
{
uint32_t lWords = PBytes / 2;
uint16_t* PpDstPtr = (uint16_t *)PpDst;
uint16_t* PpDstEnd = PpDstPtr + lWords;
const uint16_t* PpFromPtr = (const uint16_t *)PpFrom;
for (;PpDstPtr < PpDstEnd;)
{
unsigned lWord = (*PpFromPtr++);
(*PpDstPtr++) = ((lWord & 0x0ff) << 8) | lWord >> 8;
}
}
From the command-line, run this command (assuming you have gcc!)
gcc -c -g -gstabs perf.c perf.s
This takes your source code (perf.c), and outputs the disassembled output to a file called perf.s ... which you can take a look at with your text editor.
In the file, the code generated for the first function looks like this:
.globl _byteswap
.def _byteswap; .scl 2; .type 32; .endef
_byteswap:
LFB3:
.file 1 "perf.c"
.loc 1 9 0
pushl %ebp
LCFI0:
movl %esp, %ebp
LCFI1:
subl $16, %esp
LCFI2:
.loc 1 11 0
movl 16(%ebp), %eax
shrl %eax
movl %eax, -8(%ebp)
.loc 1 13 0
movl $0, -12(%ebp)
jmp L2
L3:
LBB2:
.loc 1 15 0
movl 8(%ebp), %edx
movl -12(%ebp), %eax
addl %eax, %eax
leal (%edx,%eax), %eax
movzwl (%eax), %eax
movzwl %ax, %eax
movl %eax, -4(%ebp)
.loc 1 16 0
movl 12(%ebp), %edx
movl -12(%ebp), %eax
addl %eax, %eax
leal (%edx,%eax), %ecx
movl -4(%ebp), %eax
movl %eax, %edx
sall $8, %edx
movl -4(%ebp), %eax
shrl $8, %eax
orl %edx, %eax
movw %ax, (%ecx)
LBE2:
.loc 1 13 0
addl $1, -12(%ebp)
L2:
movl -12(%ebp), %eax
cmpl -8(%ebp), %eax
jl L3
.loc 1 18 0
leave
ret
The code generated for the second function looks like this:
.globl _byteswap_2
.def _byteswap_2; .scl 2; .type 32; .endef
_byteswap_2:
LFB4:
.loc 1 21 0
pushl %ebp
LCFI3:
movl %esp, %ebp
LCFI4:
subl $32, %esp
LCFI5:
.loc 1 22 0
movl 16(%ebp), %eax
shrl %eax
movl %eax, -20(%ebp)
.loc 1 24 0
movl 12(%ebp), %eax
movl %eax, -16(%ebp)
.loc 1 25 0
movl -20(%ebp), %eax
addl %eax, %eax
addl -16(%ebp), %eax
movl %eax, -12(%ebp)
.loc 1 27 0
movl 8(%ebp), %eax
movl %eax, -8(%ebp)
jmp L6
L7:
LBB3:
.loc 1 30 0
movl -8(%ebp), %eax
movzwl (%eax), %eax
movzwl %ax, %eax
movl %eax, -4(%ebp)
addl $2, -8(%ebp)
.loc 1 31 0
movl -4(%ebp), %eax
movl %eax, %edx
sall $8, %edx
movl -4(%ebp), %eax
shrl $8, %eax
orl %eax, %edx
movl -16(%ebp), %eax
movw %dx, (%eax)
addl $2, -16(%ebp)
L6:
LBE3:
.loc 1 28 0
movl -16(%ebp), %eax
cmpl -12(%ebp), %eax
jb L7
.loc 1 33 0
leave
ret
I've highlighted the assembly of the code generated for the two inner loops. Look how much more efficient the second version is, with only a simple bit of re-coding!
It should be easy using this approach to learn how to make your code more efficient, without much effort.