ARM Cortex-M Run-Time Library Analysis

This page forms part of an ARM Cortex-M Run-Time Library Analysis by Tom Vajzovic.

Copyright

The text and presentation of this analysis are copyright 2018 Tom Vajzovic. You may not copy them except as permitted by law.

The ARM and GCC routines presented here are subject to separate copyright. Displaying them in this way is academic fair use and so I have not sought a licence from the copyright holders. You must not take them from here to use them for any other purpose. You shouldn't want to anyway, because they are suboptimal.

You may use my versions (which are better) according to the terms of The Truly Free Licence (public domain).

Cortex-M0 (ARMv6-M)

64-bit Multiply Function

Signed or Unsigned Multiply 64 x 64 = 64
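
One routine serves both signed and unsigned callers because only the low 64 bits of the product are returned, and those are the product reduced modulo 2^64, which is identical for either interpretation of the operands. The same observation means the high x high partial product never has to be computed. Below is a minimal C sketch of the value every version computes; it is my own illustration, not code from any of the libraries.

#include <assert.h>
#include <stdint.h>

/* Sketch: __aeabi_lmul returns the product modulo 2^64.  The partial
 * product a_hi * b_hi has weight 2^64, so it never affects the result,
 * and the two cross terms only need their low 32 bits.                 */
static uint64_t lmul_value(uint64_t a, uint64_t b)
{
    uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
    uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);

    uint64_t low   = (uint64_t)a_lo * b_lo;      /* full 32 x 32 = 64   */
    uint32_t cross = a_hi * b_lo + a_lo * b_hi;  /* low 32 bits suffice */

    return low + ((uint64_t)cross << 32);        /* == a * b mod 2^64   */
}

int main(void)
{
    uint64_t a = 0x123456789ABCDEF0u;
    assert(lmul_value(a, 0xFEDCBA9876543210u) == a * 0xFEDCBA9876543210u);
    /* A negative operand gives the same bit pattern as the signed product. */
    assert(lmul_value(a, (uint64_t)-5) == (uint64_t)((int64_t)a * -5));
    return 0;
}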

The implementations compared are, in the order listed below: ARM standardlib, ARM Microlib, GCC 4, GCC 5, GCC 6, GCC 7, and mine. The code for each is shown first, followed by a table of code size, stack use and cycle counts.

Code

ARM standardlib:
__aeabi_lmul:
    muls    r3, r0
    muls    r1, r2
    push    {r4, r5, lr}
    adds    r4, r3, r1
    lsrs    r1, r0, 16
    lsrs    r3, r2, 16
    mov     r5, r1
    uxth    r2, r2
    muls    r5, r3
    uxth    r0, r0
    muls    r1, r2
    adds    r4, r5, r4
    mov     r5, r0
    muls    r5, r2
    lsrs    r2, r1, 16
    lsls    r1, r1, 16
    adds    r5, r1, r5
    adcs    r2, r4
    muls    r0, r3
    lsrs    r1, r0, 16
    lsls    r0, r0, 16
    adds    r0, r0, r5
    adcs    r1, r2
    pop     {r4, r5, pc}

ARM Microlib:
__aeabi_lmul:
    push    {r4-r7, lr}
    push    {r0-r3, r4}
    sub     sp, 24
    movs    r0, 0
    str     r0, [sp, 0]
    str     r0, [sp, 4]
    str     r0, [sp, 8]
1:  ldr     r0, [sp, 24]
    ldr     r1, [sp, 24]
    uxth    r0, r0
    str     r0, [sp, 16]
    ldr     r0, [sp, 28]
    lsrs    r1, r1, 16
    lsls    r2, r0, 16
    asrs    r0, r0, 16
    orrs    r1, r2
    str     r0, [sp, 28]
    movs    r7, 0
    ldr     r5, [sp, 32]
    ldr     r0, [sp, 36]
    str     r1, [sp, 24]
    mov     r6, r7
    mov     r4, r7
    str     r0, [sp, 12]
2:  ldr     r0, [sp, 12]
    uxth    r1, r5
    lsls    r2, r0, 16
    lsrs    r0, r0, 16
    str     r0, [sp, 12]
    ldr     r0, [sp, 16]
    lsrs    r5, r5, 16
    orrs    r5, r2
    muls    r0, r1
    movs    r1, 0
    mov     r2, r4
    bl      __aeabi_llsl
    adds    r7, r0, r7
    adcs    r1, r6
    adds    r4, 16
    mov     r6, r1
    cmp     r4, 64
    blt.n   2b
    mov     r0, r7
    ldr     r2, [sp, 8]
    bl      __aeabi_llsl
    ldr     r2, [sp, 0]
    ldr     r3, [sp, 4]
    adds    r0, r0, r2
    str     r0, [sp, 0]
    adcs    r1, r3
    ldr     r0, [sp, 8]
    str     r1, [sp, 4]
    adds    r0, 16
    str     r0, [sp, 8]
    cmp     r0, 64
    blt.n   1b
    ldr     r0, [sp, 0]
    add     sp, 44
    pop     {r4-r7, pc}

GCC 4:
__aeabi_lmul:
    push    {r4-r7, lr}
    mov     r7, r9
    mov     r6, r8
    lsls    r5, r0, 16
    lsrs    r5, r5, 16
    adds    r4, r5, 0
    push    {r6, r7}
    lsls    r7, r2, 16
    lsrs    r6, r2, 16
    lsrs    r7, r7, 16
    mov     r9, r3
    lsrs    r3, r0, 16
    muls    r4, r7
    muls    r5, r6
    muls    r7, r3
    muls    r3, r6
    adds    r5, r7, r5
    lsrs    r6, r4, 16
    adds    r5, r5, r6
    mov     r12, r3
    cmp     r7, r5
    bls.n   1f
    movs    r3, 128
    lsls    r3, r3, 9
    mov     r8, r3
    add     r12, r8
1:  mov     r3, r9
    muls    r1, r2
    muls    r3, r0
    lsls    r4, r4, 16
    lsrs    r6, r5, 16
    lsrs    r4, r4, 16
    lsls    r5, r5, 16
    add     r6, r12
    adds    r4, r5, r4
    adds    r1, r3, r1
    adds    r1, r1, r6
    adds    r0, r4, 0
    pop     {r2, r3}
    mov     r8, r2
    mov     r9, r3
    pop     {r4-r7, pc}

GCC 5:
__aeabi_lmul:
    push    {r4-r7, lr}
    mov     r7, r9
    mov     r6, r8
    push    {r6, r7}
    lsls    r6, r2, 16
    lsrs    r6, r6, 16
    mov     r9, r3
    movs    r3, r6
    lsls    r5, r0, 16
    lsrs    r4, r5, 16
    lsrs    r7, r0, 16
    lsrs    r5, r2, 16
    muls    r3, r4
    muls    r6, r7
    muls    r7, r5
    muls    r5, r4
    lsrs    r4, r3, 16
    adds    r5, r5, r6
    adds    r4, r4, r5
    mov     r12, r3
    cmp     r6, r4
    bls.n   1f
    movs    r3, 128
    lsls    r3, r3, 9
    mov     r8, r3
    add     r7, r8
1:  mov     r3, r12
    lsrs    r5, r4, 16
    adds    r7, r5, r7
    lsls    r5, r3, 16
    mov     r3, r9
    muls    r2, r1
    muls    r3, r0
    lsrs    r5, r5, 16
    lsls    r4, r4, 16
    adds    r4, r4, r5
    adds    r1, r3, r2
    adds    r1, r1, r7
    movs    r0, r4
    pop     {r2, r3}
    mov     r8, r2
    mov     r9, r3
    pop     {r4-r7, pc}

GCC 6:
__aeabi_lmul:
    push    {r4-r7, lr}
    mov     lr, r9
    mov     r7, r8
    lsls    r5, r2, 16
    lsrs    r5, r5, 16
    movs    r6, r5
    push    {r7, lr}
    lsls    r7, r0, 16
    lsrs    r4, r2, 16
    lsrs    r7, r7, 16
    mov     r9, r3
    lsrs    r3, r0, 16
    muls    r6, r7
    muls    r5, r3
    muls    r7, r4
    muls    r3, r4
    adds    r7, r7, r5
    lsrs    r4, r6, 16
    adds    r4, r4, r7
    mov     r12, r3
    cmp     r5, r4
    bls.n   1f
    movs    r3, 128
    lsls    r3, r3, 9
    mov     r8, r3
    add     r12, r8
1:  mov     r3, r9
    muls    r1, r2
    muls    r3, r0
    lsls    r6, r6, 16
    lsrs    r6, r6, 16
    lsrs    r5, r4, 16
    lsls    r4, r4, 16
    add     r5, r12
    adds    r4, r4, r6
    adds    r1, r3, r1
    adds    r1, r1, r5
    movs    r0, r4
    pop     {r2, r3}
    mov     r8, r2
    mov     r9, r3
    pop     {r4-r7, pc}

GCC 7:
__aeabi_lmul:
    push    {r4-r7, lr}
    mov     lr, r9
    mov     r7, r8
    lsls    r5, r2, 16
    lsrs    r5, r5, 16
    movs    r6, r5
    push    {r7, lr}
    lsls    r7, r0, 16
    lsrs    r4, r2, 16
    lsrs    r7, r7, 16
    mov     r9, r3
    lsrs    r3, r0, 16
    muls    r6, r7
    muls    r5, r3
    muls    r7, r4
    muls    r3, r4
    adds    r7, r7, r5
    lsrs    r4, r6, 16
    adds    r4, r4, r7
    mov     r12, r3
    cmp     r5, r4
    bls.n   1f
    movs    r3, 128
    lsls    r3, r3, 9
    mov     r8, r3
    add     r12, r8
1:  mov     r3, r9
    muls    r3, r0
    muls    r1, r2
    lsrs    r5, r4, 16
    lsls    r6, r6, 16
    add     r5, r12
    lsrs    r6, r6, 16
    lsls    r4, r4, 16
    adds    r4, r4, r6
    adds    r3, r3, r5
    adds    r1, r3, r1
    movs    r0, r4
    pop     {r2, r3}
    mov     r8, r2
    mov     r9, r3
    pop     {r4-r7, pc}

Mine:
__aeabi_lmul:
    muls    r1, r2
    muls    r3, r0
    adds    r1, r3
    mov     r12, r1
    lsrs    r1, r2, 16
    uxth    r3, r0
    muls    r3, r1
    push    {r4}
    lsrs    r4, r0, 16
    muls    r1, r4
    uxth    r2, r2
    uxth    r0, r0
    muls    r0, r2
    muls    r2, r4
    lsls    r4, r3, 16
    lsrs    r3, 16
    adds    r0, r4
    pop     {r4}
    adcs    r1, r3
    lsls    r3, r2, 16
    lsrs    r2, 16
    adds    r0, r3
    adcs    r1, r2
    add     r1, r12
    bx      lr

Tool           ARM standardlib  ARM Microlib  GCC 4     GCC 5     GCC 6     GCC 7     Mine
Code (bytes)   48               118 (+32)     84        86        84        84        50
Stack (bytes)  12               64            28        28        28        28        4
Cycles (0ws)   33               996           59 or 57  60 or 58  59 or 57  59 or 57  29

Details

The additional size shown in parentheses is for other functions that the code calls (the ARM Microlib version calls __aeabi_llsl).

Cycle counts for the GCC versions vary as shown because they take two different code paths depending on whether a carry is required.
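
The branch comes from the carry check: the compiler cannot use the carry flag directly, so after summing the middle partial products it compares the sum against one of the addends and, if the sum wrapped, adds 0x10000 (the movs 128 / lsls 9 sequence) to the high word. Below is a C sketch of that technique for the low-word 32 x 32 = 64 step, where the branch occurs; it illustrates the idea and is not a line-for-line translation of the compiler output.

#include <stdint.h>

/* A 32 x 32 = 64 multiply built from 16-bit halves without using the
 * carry flag: a possible carry out of the middle sum is detected with a
 * compare, which is where the conditional path comes from.             */
static uint64_t mul32x32_64_model(uint32_t a, uint32_t b)
{
    uint32_t al = a & 0xFFFFu, ah = a >> 16;
    uint32_t bl = b & 0xFFFFu, bh = b >> 16;

    uint32_t lo   = al * bl;                /* weight 2^0  */
    uint32_t mid1 = al * bh;                /* weight 2^16 */
    uint32_t mid2 = ah * bl;                /* weight 2^16 */
    uint32_t hi   = ah * bh;                /* weight 2^32 */

    /* mid1 + (lo >> 16) cannot exceed 32 bits, so the middle sum wraps
     * exactly when the final value ends up below mid2.                 */
    uint32_t mid = mid1 + mid2 + (lo >> 16);
    if (mid < mid2)                         /* the branch seen in the listings  */
        hi += 0x10000u;                     /* carry weight 2^48 = bit 16 of hi */

    uint32_t res_lo = (mid << 16) | (lo & 0xFFFFu);
    uint32_t res_hi = hi + (mid >> 16);
    return ((uint64_t)res_hi << 32) | res_lo;
}

Compiled on any host, this can be checked against (uint64_t)a * b.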

Cycle counts for all versions will be higher than shown, in proportion to how often each version accesses the stack. This is because the Cortex-M0 has a von Neumann architecture, so a data access delays the fetch of the next instruction.

Details of the exact versions tested are given elsewhere in this analysis.

Conclusions

The GCC versions are all similar; GCC 5 is one cycle slower than GCC 4, 6, or 7.

My version is roughly twice as fast as the GCC versions. The ARM standardlib version is only slightly slower than mine (33 cycles against 29).
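
Much of the difference comes down to carry handling: the ARM standardlib routine and mine keep the working values in registers and add each shifted partial product into the result register pair with an adds/adcs pair, so no compare or branch is needed and far fewer registers have to be saved and shuffled. Below is roughly the same grouping expressed in C, with a 64-bit accumulator standing in for the r1:r0 pair; the naming is mine and the code is a sketch, not taken from either library.

#include <stdint.h>

/* Arguments as __aeabi_lmul receives them: a in r1:r0 (hi:lo), b in r3:r2.
 * Each addition of a shifted partial product below is an adds/adcs pair in
 * the assembly, so the hardware carry flag replaces the compare-based check. */
static uint64_t lmul_regpair_model(uint32_t a_lo, uint32_t a_hi,
                                   uint32_t b_lo, uint32_t b_hi)
{
    uint32_t al = a_lo & 0xFFFFu, ah = a_lo >> 16;
    uint32_t bl = b_lo & 0xFFFFu, bh = b_lo >> 16;

    uint32_t cross = a_hi * b_lo + a_lo * b_hi;    /* only the low 32 bits matter */

    uint64_t acc = (uint64_t)(al * bl);            /* weight 2^0               */
    acc += (uint64_t)(ah * bh) << 32;              /* weight 2^32              */
    acc += (uint64_t)(al * bh) << 16;              /* adds + adcs              */
    acc += (uint64_t)(ah * bl) << 16;              /* adds + adcs              */
    acc += (uint64_t)cross << 32;                  /* added into the high word */
    return acc;                                    /* returned in r1:r0        */
}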

The ARM Microlib version is not optimized at all, for either size or speed. It is more than twice as big as the ARM standardlib version, which it is intended to be a smaller replacement for, and it takes more than thirty times longer to run than it should.