|
// returns an index in range 0-13 on match, 14-32 if no match |
|
static DN_FORCEINLINE(uint32_t) |
|
find_first_matching_suffix_simd ( |
|
dn_simdhash_search_vector needle, |
|
// Only used by the vectorized implementations; discarded by scalar. |
|
dn_simdhash_suffixes haystack |
|
) { |
|
#if defined(__wasm_simd128__) |
|
return ctz(wasm_i8x16_bitmask(wasm_i8x16_eq(needle.vec, haystack.vec))); |
|
#elif defined(_M_AMD64) || defined(_M_X64) || (_M_IX86_FP == 2) || defined(__SSE2__) |
|
return ctz(_mm_movemask_epi8(_mm_cmpeq_epi8(needle.m128, haystack.m128))); |
|
#elif defined(__ARM_NEON) |
|
dn_simdhash_suffixes match_vector; |
|
// Completely untested. |
|
static const dn_simdhash_suffixes byte_mask = { |
|
{ 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 } |
|
}; |
|
union { |
|
uint8_t b[4]; |
|
uint32_t u; |
|
} msb; |
|
match_vector.vec = vceqq_u8(needle.vec, haystack.vec); |
|
dn_simdhash_suffixes masked; |
|
masked.vec = vandq_u8(match_vector.vec, byte_mask.vec); |
|
msb.b[0] = vaddv_u8(vget_low_u8(masked.vec)); |
|
msb.b[1] = vaddv_u8(vget_high_u8(masked.vec)); |
|
return ctz(msb.u); |
|
#else |
|
dn_simdhash_assert(!"Scalar fallback should be in use here"); |
|
return 32; |
|
#endif |
|
} |
Description
The current arm64 implementation in the .NET runtime is not optimized. Because arm64 lacks a direct intrinsic equivalent to
`_mm_movemask_epi8`, the bitmask is emulated, which negatively impacts performance:
runtime/src/native/containers/dn-simdhash-arch.h, lines 93 to 124 in 367cf39
This optimization can improve AOT compilation (build time) of the MAUI template app in the debug configuration on a macOS-arm64 host by ~80%:
g_hash_table_lookup):

Alternative implementations
Tasks