From 7d0056835d5f1c615c7c389e7b0c1893a734c0a1 Mon Sep 17 00:00:00 2001 From: Jon C Date: Mon, 4 Nov 2024 20:42:40 +0100 Subject: [PATCH] transfer-lamports: Improve asm performance, update table (#12) #### Problem The assembly version of transfer-lamports does some redundant work on certain registers. Also, the tables could use more info. #### Summary of changes Optimize the assembly version a bit further, and add some information about the relative performance of different implementations. --- README.md | 21 +++++++++++++-------- transfer-lamports/asm/main.s | 8 ++++---- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index bd2e4a3..f3400a8 100644 --- a/README.md +++ b/README.md @@ -165,20 +165,25 @@ the amount given by a little-endian u64 in instruction data. | Rust | 459 | | Zig | 44 | | C | 104 | -| Assembly | 31 | +| Assembly | 30 | | Rust (pinocchio) | 32 | This one starts to get interesting since it requires parsing the instruction input. Since the assembly version knows exactly where to find everything, it can -be hyper-optimized. +be hyper-optimized. The pinocchio version performs very closely to the assembly +implementation! * CPI: allocates a PDA given by the seed "You pass butter" and a bump seed in the instruction data. This requires a call to `create_program_address` to check the address and `invoke_signed` to CPI to the system program. -| Language | CU Usage | -| --- | --- | -| Rust | 3662 | -| Zig | 2825 | -| C | 3122 | -| Rust (pinocchio) | 2816 | +| Language | CU Usage | CU Usage (minus syscalls) | +| --- | --- | --- | +| Rust | 3662 | 1162 | +| Zig | 2825 | 325 | +| C | 3122 | 622 | +| Rust (pinocchio) | 2816 | 316 | + +Note: `create_program_address` consumes 1500 CUs, and `invoke_signed` consumes 1000, so +we can subtract 2500 CUs from each program to see the actual cost of the program +logic. 
diff --git a/transfer-lamports/asm/main.s b/transfer-lamports/asm/main.s index cdba678..f3de535 100644 --- a/transfer-lamports/asm/main.s +++ b/transfer-lamports/asm/main.s @@ -12,10 +12,9 @@ entrypoint: add64 r4, 8 + 8 + 32 + 32 + 8 + 8 + 10240 + 8 # calculate end of account data add64 r4, r3 mov64 r5, r4 # check how much padding we need to add - and64 r5, -8 # clear low bits + and64 r4, -8 # clear low bits jeq r5, r4, 1 # no low bits set, jump ahead add64 r4, 8 # add 8 for truncation if needed - and64 r4, -8 # clear low bits ldxb r5, [r4 + 0] # get second account jne r5, 0xff, error # we don't allow duplicates @@ -25,11 +24,12 @@ entrypoint: add64 r7, 8 + 32 + 32 + 8 + 8 + 10240 + 8 # calculate end of account data add64 r7, r6 mov64 r8, r7 # check how much padding we need to add - and64 r8, -8 # clear low bits + and64 r7, -8 # clear low bits jeq r8, r7, 1 # no low bits set, jump ahead add64 r7, 8 # add 8 for truncation if low bits are set + ldxdw r8, [r7 + 0] # get instruction data size - jne r8, 0x08, error # need 8 bytes of instruction data + jne r8, 8, error # need 8 bytes of instruction data ldxdw r8, [r7 + 8] # get instruction data as little-endian u64 sub64 r2, r8 # subtract lamports