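// Note: this gave little benefit in practice. If the table is only about 64 uint8_t entries long, the NEON version should speed up the lookup; otherwise plain C is still faster.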
#ifdef HAVE_NEON_AARCH64
/*
 * Translate a byte buffer through a 256-entry byte lookup table using
 * AArch64 NEON table-lookup instructions.
 *
 * For every i in [0, length): output_ptr[i] = lookup_table[input_ptr[i]].
 *
 * lookup_table  256 bytes; loaded as four groups of 64 bytes.
 * length        number of bytes to translate (any value, including 0 and
 *               values that are not a multiple of 64).
 * input_ptr     source buffer; exactly `length` bytes are read.
 * output_ptr    destination buffer; exactly `length` bytes are written.
 *               It may be the same pointer as input_ptr (in-place use).
 */
void table_lookup_AArch64_neon(uint8_t* lookup_table, uint32_t length, uint8_t* input_ptr, uint8_t* output_ptr)
{
    /* Load lookup table: four register groups of 64 bytes each. */
    const uint8x16x4_t table0 = vld1q_u8_x4(lookup_table);
    const uint8x16x4_t table1 = vld1q_u8_x4(lookup_table + 64);
    const uint8x16x4_t table2 = vld1q_u8_x4(lookup_table + 128);
    const uint8x16x4_t table3 = vld1q_u8_x4(lookup_table + 192);

    /* Amount subtracted from the indices before consulting the next group. */
    const uint8x16_t step = vdupq_n_u8(64);

    /* Largest multiple of 64 not exceeding length; the rest is done in C. */
    const uint32_t vector_len = length & ~(uint32_t)63;
    uint32_t i = 0;

    for (; i < vector_len; i += 64) {
        uint8x16x4_t elements = vld1q_u8_x4(input_ptr + i);

        for (int j = 0; j < 4; j++) {
            uint8x16_t src = elements.val[j];

            /*
             * A 4-register table lookup only resolves indices 0..63.
             * vqtbl4q yields 0 for larger indices; vqtbx4q keeps the
             * previous value instead. Because the subtraction wraps
             * modulo 256, each original index falls into 0..63 for
             * exactly one of the four groups, so each lane is written
             * with its table entry exactly once.
             */
            uint8x16_t dst = vqtbl4q_u8(table0, src);
            src = vsubq_u8(src, step);
            dst = vqtbx4q_u8(dst, table1, src);
            src = vsubq_u8(src, step);
            dst = vqtbx4q_u8(dst, table2, src);
            src = vsubq_u8(src, step);
            elements.val[j] = vqtbx4q_u8(dst, table3, src);
        }

        vst1q_u8_x4(output_ptr + i, elements);
    }

    /* Scalar tail so no byte beyond `length` is ever read or written. */
    for (; i < length; i++) {
        output_ptr[i] = lookup_table[input_ptr[i]];
    }
}