Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ARM's assembly code for huffman decoding #4204

Open
wants to merge 11 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion build/cmake/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ else ()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|AMD64.*|x86_64.*|X86_64.*")
set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_amd64.S)
else()
add_compile_options(-DZSTD_DISABLE_ASM)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
set(DecompressSources ${DecompressSources} ${LIBRARY_DIR}/decompress/huf_decompress_arm64.S)
else()
add_compile_options(-DZSTD_DISABLE_ASM)
endif()
endif()
endif ()
file(GLOB DictBuilderSources ${LIBRARY_DIR}/dictBuilder/*.c)
Expand Down
1 change: 1 addition & 0 deletions build/meson/lib/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ libzstd_sources = [join_paths(zstd_rootdir, 'lib/common/entropy_common.c'),
# Otherwise, explicitly disable assembly.
if [compiler_gcc, compiler_clang].contains(cc_id)
libzstd_sources += join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_amd64.S')
libzstd_sources += join_paths(zstd_rootdir, 'lib/decompress/huf_decompress_arm64.S')
else
add_project_arguments('-DZSTD_DISABLE_ASM', language: 'c')
endif
Expand Down
1 change: 1 addition & 0 deletions contrib/linux-kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ libzstd:
-DZSTD_DISABLE_ASM \
-DZSTD_LINUX_KERNEL
rm linux/lib/zstd/decompress/huf_decompress_amd64.S
rm linux/lib/zstd/decompress/huf_decompress_arm64.S
mv linux/lib/zstd/zstd.h linux/include/linux/zstd_lib.h
mv linux/lib/zstd/zstd_errors.h linux/include/linux/
cp linux_zstd.h linux/include/linux/zstd.h
Expand Down
6 changes: 6 additions & 0 deletions lib/common/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,14 @@
/* Target attribute for BMI2 dynamic dispatch.
* Enable lzcnt, bmi, and bmi2.
* We test for bmi1 & bmi2. lzcnt is included in bmi1.
* On AArch64 (ARMv8+), Huffman decoding via the assembly code needs no
* additional instruction-set extension, so the attribute is left empty there.
*/
#if defined(__aarch64__)
#define BMI2_TARGET_ATTRIBUTE
#else
#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2")
#endif

/* prefetch
* can be disabled, by declaring NO_PREFETCH build macro */
Expand Down
24 changes: 19 additions & 5 deletions lib/common/portability_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
#if ((defined(__clang__) && __has_attribute(__target__)) \
|| (defined(__GNUC__) \
&& (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
&& (defined(__x86_64__) || defined(_M_X64)) \
&& (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)) \
&& !defined(__BMI2__)
# define DYNAMIC_BMI2 1
#else
Expand Down Expand Up @@ -128,12 +128,26 @@
* - BMI2 is supported at compile time
*/
#if !defined(ZSTD_DISABLE_ASM) && \
ZSTD_ASM_SUPPORTED && \
defined(__x86_64__) && \
(DYNAMIC_BMI2 || defined(__BMI2__))
# define ZSTD_ENABLE_ASM_X86_64_BMI2 1
ZSTD_ASM_SUPPORTED

# if defined(__x86_64__) && (DYNAMIC_BMI2 || defined(__BMI2__))
# define ZSTD_ENABLE_ASM_X86_64_BMI2 1
# else
# define ZSTD_ENABLE_ASM_X86_64_BMI2 0
# endif

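/* For now only enable ARM64 assembly when ZSTD_EXPERIMENTAL_ARM64
 * is defined. This ensures that it is only enabled for your tests.
 * NOTE(review): the guard below tests only __aarch64__, not
 * ZSTD_EXPERIMENTAL_ARM64 -- confirm which behavior is intended.
 */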
# if defined(__aarch64__)
# define ZSTD_ENABLE_ASM_ARM64 1
# else
# define ZSTD_ENABLE_ASM_ARM64 0
# endif

#else
# define ZSTD_ENABLE_ASM_X86_64_BMI2 0
# define ZSTD_ENABLE_ASM_ARM64 0
#endif

/*
Expand Down
4 changes: 4 additions & 0 deletions lib/common/zstd_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,12 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
*/
MEM_STATIC int ZSTD_cpuSupportsBmi2(void)
{
#ifdef __aarch64__
return 1;
#else
ZSTD_cpuid_t cpuid = ZSTD_cpuid();
return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid);
#endif
}

#if defined (__cplusplus)
Expand Down
37 changes: 27 additions & 10 deletions lib/decompress/huf_decompress.c
Original file line number Diff line number Diff line change
Expand Up @@ -711,7 +711,11 @@ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize,
return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2
#if ZSTD_ENABLE_ASM_ARM64

HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#elif ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

Expand Down Expand Up @@ -903,21 +907,25 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize,
#if DYNAMIC_BMI2
if (flags & HUF_flags_bmi2) {
fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2
if (!(flags & HUF_flags_disableAsm)) {
#if ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop;
#elif ZSTD_ENABLE_ASM_X86_64_BMI2
loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
}
# endif
}
} else {
return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
if (!(flags & HUF_flags_disableAsm)) {
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
}
#elif ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X1_usingDTable_internal_fast_arm64_loop;
#endif
}

if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
Expand Down Expand Up @@ -1514,7 +1522,11 @@ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize,
return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
}

#if ZSTD_ENABLE_ASM_X86_64_BMI2
#if ZSTD_ENABLE_ASM_ARM64

HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_arm64_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

#elif ZSTD_ENABLE_ASM_X86_64_BMI2

HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;

Expand Down Expand Up @@ -1725,21 +1737,26 @@ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize,
#if DYNAMIC_BMI2
if (flags & HUF_flags_bmi2) {
fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
# if ZSTD_ENABLE_ASM_X86_64_BMI2

if (!(flags & HUF_flags_disableAsm)) {
#if ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop;
#elif ZSTD_ENABLE_ASM_X86_64_BMI2
loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
#endif
}
# endif
} else {
return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
}
#endif

#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
if (!(flags & HUF_flags_disableAsm)) {
#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
}
#elif ZSTD_ENABLE_ASM_ARM64
loopFn = HUF_decompress4X2_usingDTable_internal_fast_arm64_loop;
#endif
}

if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
Expand Down
Loading
Loading