diff options
author | edef <edef@edef.eu> | 2023-01-11T18·57+0000 |
---|---|---|
committer | edef <edef@edef.eu> | 2023-01-11T20·10+0000 |
commit | ec470d254ffca9822cbfa3eb783a0c8f0f523f51 (patch) | |
tree | ec4b76c824dcaf850b1d14019a0c5c842758fb78 /users/edef/refscan/src/lib.rs | |
parent | 1afb4a9f44cbc27cfb5ddb9bb690739b38f0c73e (diff) |
feat(users/edef/refscan): AArch64 support r/5646
Change-Id: I5062078739f0bf9f70c6789a9f2eafceff65d76e
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7690
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/edef/refscan/src/lib.rs')
-rw-r--r-- | users/edef/refscan/src/lib.rs | 60 |
1 files changed, 60 insertions, 0 deletions
```diff
diff --git a/users/edef/refscan/src/lib.rs b/users/edef/refscan/src/lib.rs
index fca4b290f1c4..3d4a07f3dd1c 100644
--- a/users/edef/refscan/src/lib.rs
+++ b/users/edef/refscan/src/lib.rs
@@ -55,6 +55,7 @@ mod test {
     }
 }
 
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod simd {
     #[cfg(target_arch = "x86")]
     use std::arch::x86 as arch;
@@ -92,3 +93,62 @@ mod simd {
         }
     }
 }
+
+#[cfg(target_arch = "aarch64")]
+mod simd {
+    use std::{
+        arch::aarch64::{
+            uint8x16_t as u8x16, vaddv_u8, vandq_u8, vcgtq_u8, vdupq_n_u8, vget_high_u8,
+            vget_low_u8, vshlq_u8,
+        },
+        mem, ptr,
+    };
+
+    #[allow(non_camel_case_types)]
+    #[derive(Copy, Clone)]
+    #[repr(transparent)]
+    pub struct u8x32([u8x16; 2]);
+
+    impl u8x32 {
+        #[cfg(target_endian = "little")]
+        #[inline(always)]
+        pub fn from_slice_unaligned(slice: &[u8]) -> Self {
+            assert_eq!(slice.len(), 32);
+            u8x32(unsafe { ptr::read_unaligned(slice.as_ptr().cast()) })
+        }
+
+        #[inline(always)]
+        pub fn splat(x: u8) -> Self {
+            u8x32(unsafe {
+                let x = vdupq_n_u8(x);
+                [x, x]
+            })
+        }
+
+        #[inline(always)]
+        pub fn gt(&self, b: Self) -> u32 {
+            let u8x32([al, ah]) = *self;
+            let u8x32([bl, bh]) = b;
+
+            fn f(a: u8x16, b: u8x16) -> u32 {
+                unsafe {
+                    let c = vshlq_u8(
+                        vandq_u8(vdupq_n_u8(0x80), vcgtq_u8(a, b)),
+                        mem::transmute([
+                            -7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0i8,
+                        ]),
+                    );
+
+                    (vaddv_u8(vget_low_u8(c)) as u32) << 0 | (vaddv_u8(vget_high_u8(c)) as u32) << 8
+                }
+            }
+
+            f(al, bl) << 0 | f(ah, bh) << 16
+        }
+
+        #[inline(always)]
+        pub fn lt(self, b: Self) -> u32 {
+            b.gt(self)
+        }
+    }
+}
```