From 01796285d8cb45850379b0c397e75b0842450f55 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 19:30:05 -0300 Subject: [PATCH 01/11] Add huffman decoding arm assembly --- build/cmake/lib/CMakeLists.txt | 6 +- build/meson/lib/meson.build | 1 + contrib/linux-kernel/Makefile | 1 + lib/common/compiler.h | 6 + lib/common/portability_macros.h | 12 +- lib/common/zstd_internal.h | 4 + lib/compress/zstd_compress.c | 4 + lib/decompress/huf_decompress.c | 45 ++- lib/decompress/huf_decompress_amd64.S | 4 +- lib/decompress/huf_decompress_arm64.S | 406 ++++++++++++++++++++++++++ lib/libzstd.mk | 4 + tests/fuzz/huf_decompress.c | 8 +- tests/fuzz/huf_round_trip.c | 8 +- 13 files changed, 491 insertions(+), 18 deletions(-) create mode 100644 lib/decompress/huf_decompress_arm64.S diff --git a/build/cmake/lib/CMakeLists.txt b/build/cmake/lib/CMakeLists.txt index 43b14d1753b..6e19114fbc1 100644 --- a/build/cmake/lib/CMakeLists.txt +++ b/build/cmake/lib/CMakeLists.txt @@ -42,7 +42,11 @@ else () if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|AMD64.*|x86_64.*|X86_64.*") set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_amd64.S) else() - add_compile_options(-DZSTD_DISABLE_ASM) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|ARM64.*") + set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_arm64.S) + else() + add_compile_options(-DZSTD_DISABLE_ASM) + endif() endif() endif () file(GLOB DictBuilderSources ${LIBRARY_DIR}/dictBuilder/*.c) diff --git a/build/meson/lib/meson.build b/build/meson/lib/meson.build index d086fc2d745..e697baee11e 100644 --- a/build/meson/lib/meson.build +++ b/build/meson/lib/meson.build @@ -51,6 +51,7 @@ libzstd_sources = [join_paths(zstd_rootdir, 'lib/common/entropy_common.c'), # Otherwise, explicitly disable assembly. if [compiler_gcc, compiler_clang].contains(cc_id) libzstd_sources += join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_amd64.S') + libzstd_sources += join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_arm64.S') else add_project_arguments('-DZSTD_DISABLE_ASM', language: 'c') endif diff --git a/contrib/linux-kernel/Makefile b/contrib/linux-kernel/Makefile index 63dd15d958f..1c1467b57ca 100644 --- a/contrib/linux-kernel/Makefile +++ b/contrib/linux-kernel/Makefile @@ -57,6 +57,7 @@ libzstd: -DZSTD_DISABLE_ASM \ -DZSTD_LINUX_KERNEL rm linux/lib/zstd/decompress/huf_decompress_amd64.S + rm linux/lib/zstd/decompress/huf_decompress_arm64.S mv linux/lib/zstd/zstd.h linux/include/linux/zstd_lib.h mv linux/lib/zstd/zstd_errors.h linux/include/linux/ cp linux_zstd.h linux/include/linux/zstd.h diff --git a/lib/common/compiler.h b/lib/common/compiler.h index b6cbcee0366..2e1f41e7d3d 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -128,8 +128,14 @@ /* Target attribute for BMI2 dynamic dispatch. * Enable lzcnt, bmi, and bmi2. * We test for bmi1 & bmi2. lzcnt is included in bmi1. 
+ * ARMv8+ supports decoding huffman using assembly code + * without requiring any further instruction set */ +#if defined(__aarch64__) +#define BMI2_TARGET_ATTRIBUTE +#else #define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") +#endif /* prefetch * can be disabled, by declaring NO_PREFETCH build macro */ diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index b1d9765fb07..10ca95092bf 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -81,7 +81,7 @@ #if ((defined(__clang__) && __has_attribute(__target__)) \ || (defined(__GNUC__) \ && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ - && (defined(__x86_64__) || defined(_M_X64)) \ + && (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)) \ && !defined(__BMI2__) # define DYNAMIC_BMI2 1 #else @@ -131,9 +131,19 @@ ZSTD_ASM_SUPPORTED && \ defined(__x86_64__) && \ (DYNAMIC_BMI2 || defined(__BMI2__)) +#if defined(__x86_64__) # define ZSTD_ENABLE_ASM_X86_64_BMI2 1 +# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 +#elif defined(__aarch64__) +# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 +# define ZSTD_ENABLE_ASM_ARM64_BMI2 1 +#else +# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 +# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 +#endif #else # define ZSTD_ENABLE_ASM_X86_64_BMI2 0 +# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 #endif /* diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index ecb9cfba87c..d15e15cbefc 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -381,8 +381,12 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, */ MEM_STATIC int ZSTD_cpuSupportsBmi2(void) { +#ifdef __aarch64__ + return 1; +#else ZSTD_cpuid_t cpuid = ZSTD_cpuid(); return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); +#endif } #if defined (__cplusplus) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 3d78b7da355..ac8d797b4d2 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -143,7 +143,11 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; +#ifdef __aarch64__ + cctx->bmi2 = 1; +#else cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#endif return cctx; } diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index f85dd0beea0..486ee421888 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -711,7 +711,11 @@ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; @@ -903,20 +907,26 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, #if DYNAMIC_BMI2 if (flags & HUF_flags_bmi2) { fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; -# if ZSTD_ENABLE_ASM_X86_64_BMI2 if (!(flags & HUF_flags_disableAsm)) { +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + loopFn = 
HUF_decompress4X1_usingDTable_internal_fast_arm64_loop; +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; - } # endif + } } else { return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - if (!(flags & HUF_flags_disableAsm)) { - loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; - } +#if defined(__BMI2__) + if (!(flags & HUF_flags_disableAsm)) { +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop; +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; +# endif + } #endif if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { @@ -1514,7 +1524,11 @@ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; @@ -1725,19 +1739,26 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, #if DYNAMIC_BMI2 if (flags & HUF_flags_bmi2) { fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; -# if ZSTD_ENABLE_ASM_X86_64_BMI2 + if (!(flags & HUF_flags_disableAsm)) { +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop; +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; +#endif } -# endif } else { return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +#if defined(__BMI2__) if (!(flags & HUF_flags_disableAsm)) { - loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop; +#elif ZSTD_ENABLE_ASM_X86_64_BMI2 + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; +#endif } #endif diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S index 78da291ee3c..69cd8e12422 100644 --- a/lib/decompress/huf_decompress_amd64.S +++ b/lib/decompress/huf_decompress_amd64.S @@ -119,7 +119,7 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) _HUF_decompress4X1_usingDTable_internal_fast_asm_loop: HUF_decompress4X1_usingDTable_internal_fast_asm_loop: ZSTD_CET_ENDBRANCH - /* Save all registers - even if they are callee saved for simplicity. */ + /* Save all registers - even if they are caller saved for simplicity. */ push %rax push %rbx push %rcx @@ -374,7 +374,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: _HUF_decompress4X2_usingDTable_internal_fast_asm_loop: HUF_decompress4X2_usingDTable_internal_fast_asm_loop: ZSTD_CET_ENDBRANCH - /* Save all registers - even if they are callee saved for simplicity. */ + /* Save all registers - even if they are caller saved for simplicity. */ push %rax push %rbx push %rcx diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S new file mode 100644 index 00000000000..c0530d351cd --- /dev/null +++ b/lib/decompress/huf_decompress_arm64.S @@ -0,0 +1,406 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "../common/portability_macros.h" + +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_ARM64_BMI2 + +/* Calling convention: + * + * x0 contains the first argument: HUF_DecompressAsmArgs*. + * sp contains the stack pointer. + * register's [x19, x24] contents are preserved. + * + * TODO: Support Windows calling convention. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_arm64_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_arm64_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_arm64_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_arm64_loop) +.global HUF_decompress4X1_usingDTable_internal_fast_arm64_loop +.global HUF_decompress4X2_usingDTable_internal_fast_arm64_loop +.global _HUF_decompress4X1_usingDTable_internal_fast_arm64_loop +.global _HUF_decompress4X2_usingDTable_internal_fast_arm64_loop +.text + +/* Sets up register mappings for clarity. */ + +#define op0 x2 +#define op1 x3 +#define op2 x4 +#define op3 x5 + +#define ip0 x6 +#define ip1 x7 +#define ip2 x8 +#define ip3 x9 + +#define bits0 x10 +#define bits1 x11 +#define bits2 x12 +#define bits3 x13 +#define dtable x14 +#define olimit x15 +#define ilowest x23 +#define oend x24 +#define oend1 x16 +#define oend2 x17 +#define oend3 x19 + +#define var0 x19 +#define var1 x1 +#define var2 x16 +#define var3 x17 + +/* 32-bit var registers */ +#define vard0 w19 +#define vard1 w1 +#define vard2 w16 +#define vard3 w17 + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: +HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: + ZSTD_CET_ENDBRANCH + /* save callee saved registers. 
*/ + + stp x19, x20, [sp, #-48] + stp x21, x22, [sp, #-32] + stp x23, x24, [sp, #-16] + + /* Read HUF_DecompressAsmArgs* args from x0 */ + ldp ip0, ip1, [x0] + ldp ip2, ip3, [x0, #16] + ldp op0, op1, [x0, #32] + ldp op2, op3, [x0, #48] + ldp bits0, bits1, [x0, #64] + ldp bits2, bits3, [x0, #80] + ldp dtable, ilowest, [x0, #96] + ldr oend, [x0, #112] + + mov x20, #9363 + movk x20, #37449, lsl 16 + movk x20, #18724, lsl 32 + movk x20, #9362, lsl 48 + +.L_4X1_compute_olimit: + /* Computes how many iterations we can do safely */ + + mov x15, #-3689348814741910324 + movk x15, #52429, lsl 0 + + /* x21 = ip0 - ilowest */ + sub x21, ip0, ilowest + + /* x22 = (ip0 - ilowest) / 7 */ + umulh x22, x21, x20 + sub x21, x21, x22 + add x22, x22, x21, lsr #1 + lsr x22, x22, 2 + + /* x21 = oend - op3 */ + sub x21, oend, op3 + + /* x15 = (oend - op3) / 5 */ + umulh x15, x21, x15 + lsr x15, x15, 2 + + /* x15 = min(x22, x15) */ + cmp x22, x15 + csel x15, x22, x15, ls + + cmp x15, #0 + blo .L_4X1_exit + + /* If (ip1 < ip0) go to exit */ + cmp ip0, ip1 + bhi .L_4X1_exit + + /* If (ip2 < ip1) go to exit */ + cmp ip1, ip2 + bhi .L_4X1_exit + + /* If (ip3 < ip2) go to exit */ + cmp ip2, ip3 + bhi .L_4X1_exit + + /* x15 = x15 * 5 */ + add x15, x15, x15, lsl #2 + + /* olimit = op3 + x15 */ + add olimit, x15, op3 + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + lsr var##n, bits##n, #53; \ + ldrh vard##n, [dtable, var##n, lsl #1]; + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + lsr x21, var##n, #8; \ + lsl bits##n, bits##n, var##n; \ + strb w21, [op##n, ##idx]; + +/* Assumes GET_NEXT_DELT has been called. 
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + rbit bits##n, bits##n; \ + clz bits##n, bits##n; \ + sub ip##n, ip##n, bits##n, lsr #3; \ + and x22, bits##n, #7; \ + add op##n, op##n, #5; \ + ldr bits##n, [ip##n]; \ + orr bits##n, bits##n, #1; \ + lsl bits##n, bits##n, x22; + + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + GET_NEXT_DELT(3) + + /* If op3 < olimit: continue the loop */ + cmp olimit, op3 + bhi .L_4X1_loop_body + + /* Re-compute olimit */ + b .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: + + /* Save ip / op / bits */ + stp ip0, ip1, [x0] + stp ip2, ip3, [x0, #16] + stp op0, op1, [x0, #32] + stp op2, op3, [x0, #48] + stp bits0, bits1, [x0, #64] + stp bits2, bits3, [x0, #80] + + ldp x19, x20, [sp, #-48] + ldp x21, x22, [sp, #-32] + ldp x23, x24, [sp, #-16] + + ret + +_HUF_decompress4X2_usingDTable_internal_fast_arm64_loop: +HUF_decompress4X2_usingDTable_internal_fast_arm64_loop: + ZSTD_CET_ENDBRANCH + /* save callee saved registers. 
*/ + + stp x19, x20, [sp, #-48] + stp x21, x22, [sp, #-32] + stp x23, x24, [sp, #-16] + + /* Read HUF_DecompressAsmArgs* args from x0 */ + ldp ip0, ip1, [x0] + ldp ip2, ip3, [x0, #16] + ldp op0, op1, [x0, #32] + ldp op2, op3, [x0, #48] + ldp bits0, bits1, [x0, #64] + ldp bits2, bits3, [x0, #80] + ldp dtable, ilowest, [x0, #96] + ldr oend, [x0, #112] + + mov oend1, op1 + mov oend2, op2 + mov oend3, op3 + + mov x22, #9363 + movk x22, #37449, lsl 16 + movk x22, #18724, lsl 32 + movk x22, #9362, lsl 48 + +.L_4X2_compute_olimit: + + /* x21 = ip0 - ilowest */ + sub x21, ip0, ilowest + sub x1, oend1, op0 + sub x20, oend2, op1 + + /* x15 = (ip0 - ilowest) / 7 */ + /* x15 = min(x15, min(oend1 - op0, oend2 - op1, oend3 - op2, oend - op3) / 10) */ + + umulh x15, x21, x22 + sub x21, x21, x15 + add x15, x15, x21, lsr #1 + + sub x21, oend3, op2 + + lsr x15, x15, 2 + + cmp x1, x20 + csel x20, x1, x20, ls + + sub x1, oend, op3 + + cmp x21, x20 + csel x20, x21, x20, ls + + mov x21, -3689348814741910324 + movk x21, 0xcccd, lsl 0 + + cmp x1, x20 + csel x20, x1, x20, ls + + umulh x21, x20, x21 + lsr x20, x21, 3 + + cmp x15, x20 + csel x20, x15, x20, ls + + /* If x20 == 0 */ + cmp x20, #0 + beq .L_4X2_exit + + /* If (ip1 < ip0) go to exit */ + cmp ip0, ip1 + bhi .L_4X2_exit + + /* If (ip2 < ip1) go to exit */ + cmp ip1, ip2 + bhi .L_4X2_exit + + /* If (ip3 < ip2) go to exit */ + cmp ip2, ip3 + bhi .L_4X2_exit + + /* olimit = op3 + 5 * x20 */ + add x1, x20, x20, lsl #2 + add olimit, op3, x1 + +#define DECODE(n, idx) \ + lsr x21, bits##n, #53; \ + ldr w21, [dtable, x21, lsl #2]; \ + strh w21, [op##n]; \ + lsr w20, w21, #16; \ + add op##n, op##n, x21, lsr #24; \ + and x20, x20, #255; \ + lsl bits##n, bits##n, x20; + +#define RELOAD_BITS(n) \ + rbit bits##n, bits##n; \ + clz bits##n, bits##n; \ + sub ip##n, ip##n, bits##n, lsr #3; \ + and x1, bits##n, #7; \ + ldr bits##n, [ip##n]; \ + orr bits##n, bits##n, #1; \ + lsl bits##n, bits##n, x1; + + .p2align 6 + +.L_4X2_loop_body: + + /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp olimit, op3 + bhi .L_4X2_loop_body + b .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: + + /* Save ip / op / bits */ + stp ip0, ip1, [x0] + stp ip2, ip3, [x0, #16] + stp op0, op1, [x0, #32] + stp op2, op3, [x0, #48] + stp bits0, bits1, [x0, #64] + stp bits2, bits3, [x0, #80] + + ldp x19, x20, [sp, #-48] + ldp x21, x22, [sp, #-32] + ldp x23, x24, [sp, #-16] + + ret + +#endif diff --git a/lib/libzstd.mk b/lib/libzstd.mk index 91bd4caf382..6cd75945d88 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -155,12 +155,16 @@ ZSTD_LEGACY_FILES := ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_amd64.S)) +ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_arm64.S)) + ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM else # Unconditionally add the ASM files they are disabled by # macros in the .S file. 
ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_AMD64_ASM_FILES) + + ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_ARM64_ASM_FILES) endif ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0) diff --git a/tests/fuzz/huf_decompress.c b/tests/fuzz/huf_decompress.c index fcd4b1a3bd2..5c2442a6879 100644 --- a/tests/fuzz/huf_decompress.c +++ b/tests/fuzz/huf_decompress.c @@ -28,8 +28,14 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) /* Select random parameters: #streams, X1 or X2 decoding, bmi2 */ int const streams = FUZZ_dataProducer_int32Range(producer, 0, 1); int const symbols = FUZZ_dataProducer_int32Range(producer, 0, 1); + bool cpuSupportsBmi; +#ifdef __aarch64__ + cpuSupportsBmi = true; +#else + cpuSupportsBmi = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#endif int const flags = 0 - | (ZSTD_cpuid_bmi2(ZSTD_cpuid()) && FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_bmi2 : 0) + | (cpuSupportsBmi && FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_bmi2 : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_optimalDepth : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_preferRepeat : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_suspectUncompressible : 0) diff --git a/tests/fuzz/huf_round_trip.c b/tests/fuzz/huf_round_trip.c index 4d0f8de23f5..da108ef081a 100644 --- a/tests/fuzz/huf_round_trip.c +++ b/tests/fuzz/huf_round_trip.c @@ -44,8 +44,14 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) /* Select random parameters: #streams, X1 or X2 decoding, bmi2 */ int const streams = FUZZ_dataProducer_int32Range(producer, 0, 1); int const symbols = FUZZ_dataProducer_int32Range(producer, 0, 1); + bool cpuSupportsBmi; +#ifdef __aarch64__ + cpuSupportsBmi = true; +#else + cpuSupportsBmi = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#endif int const flags = 0 - | (ZSTD_cpuid_bmi2(ZSTD_cpuid()) && FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_bmi2 : 0) + | (cpuSupportsBmi && FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_bmi2 : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_optimalDepth : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_preferRepeat : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? 
HUF_flags_suspectUncompressible : 0) From 37a39c8b0235f2aafdd583c7eab25c45fcbb3e9f Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 19:43:49 -0300 Subject: [PATCH 02/11] Fix cmake --- build/cmake/lib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/cmake/lib/CMakeLists.txt b/build/cmake/lib/CMakeLists.txt index 6e19114fbc1..fbe7eef44d4 100644 --- a/build/cmake/lib/CMakeLists.txt +++ b/build/cmake/lib/CMakeLists.txt @@ -42,7 +42,7 @@ else () if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|AMD64.*|x86_64.*|X86_64.*") set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_amd64.S) else() - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64.*|ARM64.*") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_arm64.S) else() add_compile_options(-DZSTD_DISABLE_ASM) From c4ed7d065ca8da4ba7d9e45235e449e40431728a Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 19:56:44 -0300 Subject: [PATCH 03/11] Fix not enabling asm on arm --- lib/common/portability_macros.h | 1 - lib/libzstd.mk | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index 10ca95092bf..fb7ea75c380 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -129,7 +129,6 @@ */ #if !defined(ZSTD_DISABLE_ASM) && \ ZSTD_ASM_SUPPORTED && \ - defined(__x86_64__) && \ (DYNAMIC_BMI2 || defined(__BMI2__)) #if defined(__x86_64__) # define ZSTD_ENABLE_ASM_X86_64_BMI2 1 diff --git a/lib/libzstd.mk b/lib/libzstd.mk index 6cd75945d88..b54c004938d 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -155,7 +155,7 @@ ZSTD_LEGACY_FILES := ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_amd64.S)) -ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_arm64.S)) +ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_arm64.S)) ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM From 45a8c1f4817477eeeaf7100a5f9b847cdce0078a Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 20:32:25 -0300 Subject: [PATCH 04/11] test --- lib/decompress/huf_decompress_arm64.S | 2 +- lib/libzstd.mk | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S index c0530d351cd..208f2207a35 100644 --- a/lib/decompress/huf_decompress_arm64.S +++ b/lib/decompress/huf_decompress_arm64.S @@ -141,7 +141,7 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: csel x15, x22, x15, ls cmp x15, #0 - blo .L_4X1_exit + beq .L_4X1_exit /* If (ip1 < ip0) go to exit */ cmp ip0, ip1 diff --git a/lib/libzstd.mk b/lib/libzstd.mk index b54c004938d..b1b98c27c01 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -157,6 +157,8 @@ ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_ ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_arm64.S)) +CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 + ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM else @@ -168,7 +170,7 @@ else endif ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0) - CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 + #CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 endif ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0) From 8abba6f8b71c2089233e74869dd0805db47d17c1 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 
20:38:08 -0300 Subject: [PATCH 05/11] Test case X1 --- lib/libzstd.mk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/libzstd.mk b/lib/libzstd.mk index b1b98c27c01..dc1f8e0032e 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -157,7 +157,7 @@ ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_ ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_arm64.S)) -CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 +CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM @@ -170,11 +170,11 @@ else endif ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0) - #CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 + CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 endif ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0) - CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 + #CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 endif ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT), 0) From 4d9c0d0cb4c1e43f1631b6d7ea80e5f1e9ec4cee Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 20:59:17 -0300 Subject: [PATCH 06/11] Clean --- lib/common/portability_macros.h | 31 +++++++++++++++----------- lib/decompress/huf_decompress.c | 32 ++++++++++++--------------- lib/decompress/huf_decompress_amd64.S | 4 ++-- lib/decompress/huf_decompress_arm64.S | 2 +- lib/libzstd.mk | 4 +--- 5 files changed, 36 insertions(+), 37 deletions(-) diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index fb7ea75c380..51433868e26 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -128,21 +128,26 @@ * - BMI2 is supported at compile time */ #if !defined(ZSTD_DISABLE_ASM) && \ - ZSTD_ASM_SUPPORTED && \ - (DYNAMIC_BMI2 || defined(__BMI2__)) -#if defined(__x86_64__) -# define ZSTD_ENABLE_ASM_X86_64_BMI2 1 -# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 -#elif defined(__aarch64__) -# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 -# define ZSTD_ENABLE_ASM_ARM64_BMI2 1 -#else -# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 -# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 -#endif + ZSTD_ASM_SUPPORTED + +# if defined(__x86_64__) && (DYNAMIC_BMI2 || defined(__BMI2__)) +# define ZSTD_ENABLE_ASM_X86_64_BMI2 1 +# else +# define ZSTD_ENABLE_ASM_X86_64_BMI2 0 +# endif + +/* For now only enable ARM64 assembly when ZSTD_EXPERIMENTAL_ARM64 ++ * is defined. This ensures that it is only enabled for your tests. 
++ */ +# if defined(__aarch64__) && defined(ZSTD_EXPERIMENTAL_ARM64) && !defined(__APPLE__) +# define ZSTD_ENABLE_ASM_ARM64 1 +# else +# define ZSTD_ENABLE_ASM_ARM64 0 +# endif + #else # define ZSTD_ENABLE_ASM_X86_64_BMI2 0 -# define ZSTD_ENABLE_ASM_ARM64_BMI2 0 +# define ZSTD_ENABLE_ASM_ARM64 0 #endif /* diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 486ee421888..8cee6cbbc84 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -711,7 +711,7 @@ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#if ZSTD_ENABLE_ASM_ARM64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64 HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; @@ -908,7 +908,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, if (flags & HUF_flags_bmi2) { fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; if (!(flags & HUF_flags_disableAsm)) { -#if ZSTD_ENABLE_ASM_ARM64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64 loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop; #elif ZSTD_ENABLE_ASM_X86_64_BMI2 loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; @@ -919,15 +919,13 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, } #endif -#if defined(__BMI2__) - if (!(flags & HUF_flags_disableAsm)) { -#if ZSTD_ENABLE_ASM_ARM64_BMI2 - loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop; -#elif ZSTD_ENABLE_ASM_X86_64_BMI2 - loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; -# endif - } + if (!(flags & HUF_flags_disableAsm)) { +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; +#elif ZSTD_ENABLE_ASM_ARM64 + loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop #endif + } if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); @@ -1524,7 +1522,7 @@ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#if ZSTD_ENABLE_ASM_ARM64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64 HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; @@ -1741,7 +1739,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; if (!(flags & HUF_flags_disableAsm)) { -#if ZSTD_ENABLE_ASM_ARM64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64 loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop; #elif ZSTD_ENABLE_ASM_X86_64_BMI2 loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; @@ -1752,15 +1750,13 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, } #endif -#if defined(__BMI2__) if (!(flags & HUF_flags_disableAsm)) { -#if ZSTD_ENABLE_ASM_ARM64_BMI2 - loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop; -#elif ZSTD_ENABLE_ASM_X86_64_BMI2 - loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; +#elif ZSTD_ENABLE_ASM_ARM64 + loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop #endif } -#endif if 
(HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S index 69cd8e12422..78da291ee3c 100644 --- a/lib/decompress/huf_decompress_amd64.S +++ b/lib/decompress/huf_decompress_amd64.S @@ -119,7 +119,7 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) _HUF_decompress4X1_usingDTable_internal_fast_asm_loop: HUF_decompress4X1_usingDTable_internal_fast_asm_loop: ZSTD_CET_ENDBRANCH - /* Save all registers - even if they are caller saved for simplicity. */ + /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx push %rcx @@ -374,7 +374,7 @@ HUF_decompress4X1_usingDTable_internal_fast_asm_loop: _HUF_decompress4X2_usingDTable_internal_fast_asm_loop: HUF_decompress4X2_usingDTable_internal_fast_asm_loop: ZSTD_CET_ENDBRANCH - /* Save all registers - even if they are caller saved for simplicity. */ + /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx push %rcx diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S index 208f2207a35..4c4ac906516 100644 --- a/lib/decompress/huf_decompress_arm64.S +++ b/lib/decompress/huf_decompress_arm64.S @@ -17,7 +17,7 @@ .section .note.GNU-stack,"",%progbits #endif -#if ZSTD_ENABLE_ASM_ARM64_BMI2 +#if ZSTD_ENABLE_ASM_ARM64 /* Calling convention: * diff --git a/lib/libzstd.mk b/lib/libzstd.mk index dc1f8e0032e..b54c004938d 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -157,8 +157,6 @@ ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_ ZSTD_DECOMPRESS_ARM64_ASM_FILES := $(sort $(wildcard $(LIB_SRCDIR)/decompress/*_arm64.S)) -CFLAGS += -DHUF_FORCE_DECOMPRESS_X1 - ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM else @@ -174,7 +172,7 @@ ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0) endif ifneq ($(HUF_FORCE_DECOMPRESS_X2), 0) - #CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 + CFLAGS += -DHUF_FORCE_DECOMPRESS_X2 endif ifneq ($(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT), 0) From eca2516d635eb72ad7e03093e2158d5b2c9c96e7 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 21:17:50 -0300 Subject: [PATCH 07/11] Use int instead of bool --- tests/fuzz/huf_decompress.c | 4 ++-- tests/fuzz/huf_round_trip.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fuzz/huf_decompress.c b/tests/fuzz/huf_decompress.c index 5c2442a6879..37b6e7ef6e8 100644 --- a/tests/fuzz/huf_decompress.c +++ b/tests/fuzz/huf_decompress.c @@ -28,9 +28,9 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) /* Select random parameters: #streams, X1 or X2 decoding, bmi2 */ int const streams = FUZZ_dataProducer_int32Range(producer, 0, 1); int const symbols = FUZZ_dataProducer_int32Range(producer, 0, 1); - bool cpuSupportsBmi; + int cpuSupportsBmi; #ifdef __aarch64__ - cpuSupportsBmi = true; + cpuSupportsBmi = 1; #else cpuSupportsBmi = ZSTD_cpuid_bmi2(ZSTD_cpuid()); #endif diff --git a/tests/fuzz/huf_round_trip.c b/tests/fuzz/huf_round_trip.c index da108ef081a..7572a1d8149 100644 --- a/tests/fuzz/huf_round_trip.c +++ b/tests/fuzz/huf_round_trip.c @@ -44,9 +44,9 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) /* Select random parameters: #streams, X1 or X2 decoding, bmi2 */ int const streams = FUZZ_dataProducer_int32Range(producer, 0, 1); int const symbols = 
FUZZ_dataProducer_int32Range(producer, 0, 1); - bool cpuSupportsBmi; + int cpuSupportsBmi; #ifdef __aarch64__ - cpuSupportsBmi = true; + cpuSupportsBmi = 1; #else cpuSupportsBmi = ZSTD_cpuid_bmi2(ZSTD_cpuid()); #endif From f4a1f49e49f4614c3f9e3b359e289ce7014289b4 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 21:43:20 -0300 Subject: [PATCH 08/11] Enable arm asm code on apple --- lib/common/portability_macros.h | 2 +- lib/decompress/huf_decompress.c | 4 +- lib/decompress/huf_decompress_arm64.S | 85 ++++++++++++++------------- 3 files changed, 47 insertions(+), 44 deletions(-) diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index 51433868e26..67b17eda848 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -139,7 +139,7 @@ /* For now only enable ARM64 assembly when ZSTD_EXPERIMENTAL_ARM64 + * is defined. This ensures that it is only enabled for your tests. + */ -# if defined(__aarch64__) && defined(ZSTD_EXPERIMENTAL_ARM64) && !defined(__APPLE__) +# if defined(__aarch64__) # define ZSTD_ENABLE_ASM_ARM64 1 # else # define ZSTD_ENABLE_ASM_ARM64 0 diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 8cee6cbbc84..9deb6335b60 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -923,7 +923,7 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; #elif ZSTD_ENABLE_ASM_ARM64 - loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop + loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop; #endif } @@ -1754,7 +1754,7 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; #elif ZSTD_ENABLE_ASM_ARM64 - loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop + loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop; #endif } diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S index 4c4ac906516..12dbf2a9f0e 100644 --- a/lib/decompress/huf_decompress_arm64.S +++ b/lib/decompress/huf_decompress_arm64.S @@ -19,6 +19,11 @@ #if ZSTD_ENABLE_ASM_ARM64 +#if !(__APPLE__) +/* on Apple platforms ' %%' is used as seperator instead of ';' */ + #define %% ; +#endif + /* Calling convention: * * x0 contains the first argument: HUF_DecompressAsmArgs*. @@ -75,16 +80,16 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_arm64_loop) /* Calls X(N) for each stream 0, 1, 2, 3. */ #define FOR_EACH_STREAM(X) \ - X(0); \ - X(1); \ - X(2); \ + X(0) %% \ + X(1) %% \ + X(2) %% \ X(3) /* Calls X(N, idx) for each stream 0, 1, 2, 3. 
*/ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ - X(0, idx); \ - X(1, idx); \ - X(2, idx); \ + X(0, idx) %% \ + X(1, idx) %% \ + X(2, idx) %% \ X(3, idx) /* Define both _HUF_* & HUF_* symbols because MacOS @@ -164,30 +169,28 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: /* Reads top 11 bits from bits[n] * Loads dt[bits[n]] into var[n] */ -#define GET_NEXT_DELT(n) \ - lsr var##n, bits##n, #53; \ - ldrh vard##n, [dtable, var##n, lsl #1]; +#define GET_NEXT_DELT(n) \ + lsr var##n, bits##n, #53 %% \ + ldrh vard##n, [dtable, var##n, lsl #1] %% /* var[n] must contain the DTable entry computed with GET_NEXT_DELT - * Moves var[n] to %rax + * Moves var[n] to x21 * bits[n] <<= var[n] & 63 - * op[n][idx] = %rax >> 8 - * %ah is a way to access bits [8, 16) of %rax + * op[n][idx] = x21 >> 8 */ -#define DECODE_FROM_DELT(n, idx) \ - lsr x21, var##n, #8; \ - lsl bits##n, bits##n, var##n; \ - strb w21, [op##n, ##idx]; +#define DECODE_FROM_DELT(n, idx) \ + lsr x21, var##n, #8 %% \ + lsl bits##n, bits##n, var##n %% \ + strb w21, [op##n, ##idx] %% /* Assumes GET_NEXT_DELT has been called. * Calls DECODE_FROM_DELT then GET_NEXT_DELT */ #define DECODE_AND_GET_NEXT(n, idx) \ - DECODE_FROM_DELT(n, idx); \ + DECODE_FROM_DELT(n, idx) %% \ GET_NEXT_DELT(n) \ /* // ctz & nbBytes is stored in bits[n] - * // nbBits is stored in %rax * ctz = CTZ[bits[n]] * nbBits = ctz & 7 * nbBytes = ctz >> 3 @@ -198,14 +201,14 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: * bits[n] <<= nbBits */ #define RELOAD_BITS(n) \ - rbit bits##n, bits##n; \ - clz bits##n, bits##n; \ - sub ip##n, ip##n, bits##n, lsr #3; \ - and x22, bits##n, #7; \ - add op##n, op##n, #5; \ - ldr bits##n, [ip##n]; \ - orr bits##n, bits##n, #1; \ - lsl bits##n, bits##n, x22; + rbit bits##n, bits##n %% \ + clz bits##n, bits##n %% \ + sub ip##n, ip##n, bits##n, lsr #3 %% \ + and x22, bits##n, #7 %% \ + add op##n, op##n, #5 %% \ + ldr bits##n, [ip##n] %% \ + orr bits##n, bits##n, #1 %% \ + lsl bits##n, bits##n, x22 %% /* Call GET_NEXT_DELT for each stream */ FOR_EACH_STREAM(GET_NEXT_DELT) @@ -350,23 +353,23 @@ HUF_decompress4X2_usingDTable_internal_fast_arm64_loop: add x1, x20, x20, lsl #2 add olimit, op3, x1 -#define DECODE(n, idx) \ - lsr x21, bits##n, #53; \ - ldr w21, [dtable, x21, lsl #2]; \ - strh w21, [op##n]; \ - lsr w20, w21, #16; \ - add op##n, op##n, x21, lsr #24; \ - and x20, x20, #255; \ - lsl bits##n, bits##n, x20; +#define DECODE(n, idx) \ + lsr x21, bits##n, #53 %% \ + ldr w21, [dtable, x21, lsl #2] %% \ + strh w21, [op##n] %% \ + lsr w20, w21, #16 %% \ + add op##n, op##n, x21, lsr #24 %% \ + and x20, x20, #255 %% \ + lsl bits##n, bits##n, x20 %% #define RELOAD_BITS(n) \ - rbit bits##n, bits##n; \ - clz bits##n, bits##n; \ - sub ip##n, ip##n, bits##n, lsr #3; \ - and x1, bits##n, #7; \ - ldr bits##n, [ip##n]; \ - orr bits##n, bits##n, #1; \ - lsl bits##n, bits##n, x1; + rbit bits##n, bits##n %% \ + clz bits##n, bits##n %% \ + sub ip##n, ip##n, bits##n, lsr #3 %% \ + and x1, bits##n, #7 %% \ + ldr bits##n, [ip##n] %% \ + orr bits##n, bits##n, #1 %% \ + lsl bits##n, bits##n, x1 %% .p2align 6 From a725fe05f5ba56689c2b23a4b8b6ec3399d434a1 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 21:46:13 -0300 Subject: [PATCH 09/11] Fix asm on Linux --- lib/decompress/huf_decompress_arm64.S | 72 ++++++++++++++------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S index 12dbf2a9f0e..94f990a906d 100644 --- 
a/lib/decompress/huf_decompress_arm64.S +++ b/lib/decompress/huf_decompress_arm64.S @@ -21,7 +21,9 @@ #if !(__APPLE__) /* on Apple platforms ' %%' is used as seperator instead of ';' */ - #define %% ; + #define __ENDL__ ; +#else + #define __ENDL__ %% #endif /* Calling convention: @@ -80,16 +82,16 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_arm64_loop) /* Calls X(N) for each stream 0, 1, 2, 3. */ #define FOR_EACH_STREAM(X) \ - X(0) %% \ - X(1) %% \ - X(2) %% \ + X(0) __ENDL__ \ + X(1) __ENDL__ \ + X(2) __ENDL__ \ X(3) /* Calls X(N, idx) for each stream 0, 1, 2, 3. */ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ - X(0, idx) %% \ - X(1, idx) %% \ - X(2, idx) %% \ + X(0, idx) __ENDL__ \ + X(1, idx) __ENDL__ \ + X(2, idx) __ENDL__ \ X(3, idx) /* Define both _HUF_* & HUF_* symbols because MacOS @@ -170,8 +172,8 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: * Loads dt[bits[n]] into var[n] */ #define GET_NEXT_DELT(n) \ - lsr var##n, bits##n, #53 %% \ - ldrh vard##n, [dtable, var##n, lsl #1] %% + lsr var##n, bits##n, #53 __ENDL__ \ + ldrh vard##n, [dtable, var##n, lsl #1] __ENDL__ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT * Moves var[n] to x21 @@ -179,15 +181,15 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: * op[n][idx] = x21 >> 8 */ #define DECODE_FROM_DELT(n, idx) \ - lsr x21, var##n, #8 %% \ - lsl bits##n, bits##n, var##n %% \ - strb w21, [op##n, ##idx] %% + lsr x21, var##n, #8 __ENDL__ \ + lsl bits##n, bits##n, var##n __ENDL__ \ + strb w21, [op##n, ##idx] __ENDL__ /* Assumes GET_NEXT_DELT has been called. * Calls DECODE_FROM_DELT then GET_NEXT_DELT */ #define DECODE_AND_GET_NEXT(n, idx) \ - DECODE_FROM_DELT(n, idx) %% \ + DECODE_FROM_DELT(n, idx) __ENDL__ \ GET_NEXT_DELT(n) \ /* // ctz & nbBytes is stored in bits[n] @@ -201,14 +203,14 @@ HUF_decompress4X1_usingDTable_internal_fast_arm64_loop: * bits[n] <<= nbBits */ #define RELOAD_BITS(n) \ - rbit bits##n, bits##n %% \ - clz bits##n, bits##n %% \ - sub ip##n, ip##n, bits##n, lsr #3 %% \ - and x22, bits##n, #7 %% \ - add op##n, op##n, #5 %% \ - ldr bits##n, [ip##n] %% \ - orr bits##n, bits##n, #1 %% \ - lsl bits##n, bits##n, x22 %% + rbit bits##n, bits##n __ENDL__ \ + clz bits##n, bits##n __ENDL__ \ + sub ip##n, ip##n, bits##n, lsr #3 __ENDL__ \ + and x22, bits##n, #7 __ENDL__ \ + add op##n, op##n, #5 __ENDL__ \ + ldr bits##n, [ip##n] __ENDL__ \ + orr bits##n, bits##n, #1 __ENDL__ \ + lsl bits##n, bits##n, x22 __ENDL__ /* Call GET_NEXT_DELT for each stream */ FOR_EACH_STREAM(GET_NEXT_DELT) @@ -354,22 +356,22 @@ HUF_decompress4X2_usingDTable_internal_fast_arm64_loop: add olimit, op3, x1 #define DECODE(n, idx) \ - lsr x21, bits##n, #53 %% \ - ldr w21, [dtable, x21, lsl #2] %% \ - strh w21, [op##n] %% \ - lsr w20, w21, #16 %% \ - add op##n, op##n, x21, lsr #24 %% \ - and x20, x20, #255 %% \ - lsl bits##n, bits##n, x20 %% + lsr x21, bits##n, #53 __ENDL__ \ + ldr w21, [dtable, x21, lsl #2] __ENDL__ \ + strh w21, [op##n] __ENDL__ \ + lsr w20, w21, #16 __ENDL__ \ + add op##n, op##n, x21, lsr #24 __ENDL__ \ + and x20, x20, #255 __ENDL__ \ + lsl bits##n, bits##n, x20 __ENDL__ #define RELOAD_BITS(n) \ - rbit bits##n, bits##n %% \ - clz bits##n, bits##n %% \ - sub ip##n, ip##n, bits##n, lsr #3 %% \ - and x1, bits##n, #7 %% \ - ldr bits##n, [ip##n] %% \ - orr bits##n, bits##n, #1 %% \ - lsl bits##n, bits##n, x1 %% + rbit bits##n, bits##n __ENDL__ \ + clz bits##n, bits##n __ENDL__ \ + sub ip##n, ip##n, bits##n, lsr #3 __ENDL__ \ + and x1, bits##n, #7 __ENDL__ \ + ldr bits##n, 
[ip##n] __ENDL__ \ + orr bits##n, bits##n, #1 __ENDL__ \ + lsl bits##n, bits##n, x1 __ENDL__ .p2align 6 From 420ebcede09f9ee20629049175f7056919b2847d Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 22:26:20 -0300 Subject: [PATCH 10/11] Fix msan warning --- lib/decompress/huf_decompress_arm64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/decompress/huf_decompress_arm64.S b/lib/decompress/huf_decompress_arm64.S index 94f990a906d..f0c564c0b77 100644 --- a/lib/decompress/huf_decompress_arm64.S +++ b/lib/decompress/huf_decompress_arm64.S @@ -19,11 +19,11 @@ #if ZSTD_ENABLE_ASM_ARM64 -#if !(__APPLE__) +#if defined(__APPLE__) && (__APPLE__) /* on Apple platforms ' %%' is used as seperator instead of ';' */ - #define __ENDL__ ; -#else #define __ENDL__ %% +#else + #define __ENDL__ ; #endif /* Calling convention: From bc766eca4fbb3334f8f77ebebecb08949dd12364 Mon Sep 17 00:00:00 2001 From: Nicolas De Carli Date: Mon, 25 Nov 2024 22:34:12 -0300 Subject: [PATCH 11/11] Call ZSTD_cpuid_bmi2 instead of adding extra branching --- lib/compress/zstd_compress.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index ac8d797b4d2..3d78b7da355 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -143,11 +143,7 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; -#ifdef __aarch64__ - cctx->bmi2 = 1; -#else cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); -#endif return cctx; }