This is a small test I built. Here we have two scenarios:
- Scenario 1: Two functions (
scenario1a
andscenario1b
) which inputs and outputs areuint16_t*
and load/store to/from Neon datatype (uint16x8x4_t
). - Scenario 2: Same functions as Scenario 1 (in this case
scenario2a
andscenario2b
) but the inputs and outputs areuint16x8x4_t*
, and the load and store are done in the main function.
(Below the c code I include the disassembly generated after compiling with -O3).
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vextq_u16(temp.val[0], vmulq_n_u16(temp.val[1], -1), 2);
result.val[1] = vextq_u16(temp.val[1], vmulq_n_u16(temp.val[2], -1), 2);
result.val[2] = vextq_u16(temp.val[2], vmulq_n_u16(temp.val[3], -1), 2);
result.val[3] = vextq_u16(temp.val[3], vmulq_n_u16(temp.val[0], -1), 2);
vst1q_u16_x4(resultArray, result);
}
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vaddq_u16(temp.val[0], temp.val[1]);
result.val[1] = vmulq_n_u16(temp.val[1], -2);
result.val[2] = vaddq_u16(temp.val[2], temp.val[3]);
result.val[3] = vmulq_n_u16(temp.val[3], -2);
vst1q_u16_x4(resultArray, result);
}
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
result->val[1] = vmulq_n_u16(temp->val[1], -2);
result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
result->val[3] = vmulq_n_u16(temp->val[3], -2);
}
int main(void) {
uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};
// Scenario 01: Input and output are uint16_t*
uint16_t result01a[32];
uint16_t result01_final[32];
scenario1a(result01a, input);
scenario1b(result01_final, result01a);
// Scenario 02: Input and output are uint16x8x4_t
uint16_t result02_final[32];
uint16x8x4_t temp, result02a, result02b;
temp = vld1q_u16_x4(input);
scenario2a(&result02a, &temp);
scenario2b(&result02b, &result02a);
vst1q_u16_x4(result02_final, result02b);
return 0;
}
Disassembly:
test: file format elf64-littleaarch64
Disassembly of section .init:
0000000000000658 <_init>:
658: a9bf7bfd stp x29, x30, [sp, #-16]!
65c: 910003fd mov x29, sp
660: 94000065 bl 7f4 <call_weak_fn>
664: a8c17bfd ldp x29, x30, [sp], #16
668: d65f03c0 ret
Disassembly of section .plt:
0000000000000670 <.plt>:
670: a9bf7bf0 stp x16, x30, [sp, #-16]!
674: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
678: f947c611 ldr x17, [x16, #3976]
67c: 913e2210 add x16, x16, #0xf88
680: d61f0220 br x17
684: d503201f nop
688: d503201f nop
68c: d503201f nop
0000000000000690 <__cxa_finalize@plt>:
690: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
694: f947ca11 ldr x17, [x16, #3984]
698: 913e4210 add x16, x16, #0xf90
69c: d61f0220 br x17
00000000000006a0 <__libc_start_main@plt>:
6a0: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
6a4: f947ce11 ldr x17, [x16, #3992]
6a8: 913e6210 add x16, x16, #0xf98
6ac: d61f0220 br x17
00000000000006b0 <__stack_chk_fail@plt>:
6b0: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
6b4: f947d211 ldr x17, [x16, #4000]
6b8: 913e8210 add x16, x16, #0xfa0
6bc: d61f0220 br x17
00000000000006c0 <__gmon_start__@plt>:
6c0: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
6c4: f947d611 ldr x17, [x16, #4008]
6c8: 913ea210 add x16, x16, #0xfa8
6cc: d61f0220 br x17
00000000000006d0 <abort@plt>:
6d0: 90000090 adrp x16, 10000 <__FRAME_END__ 0xf3d8>
6d4: f947da11 ldr x17, [x16, #4016]
6d8: 913ec210 add x16, x16, #0xfb0
6dc: d61f0220 br x17
Disassembly of section .text:
00000000000006e0 <main>:
6e0: 90000085 adrp x5, 10000 <__FRAME_END__ 0xf3d8>
6e4: a9a67bfd stp x29, x30, [sp, #-416]!
6e8: 910003fd mov x29, sp
6ec: 90000002 adrp x2, 0 <_init-0x658>
6f0: 91292042 add x2, x2, #0xa48
6f4: 910263e3 add x3, sp, #0x98
6f8: 910363e0 add x0, sp, #0xd8
6fc: 6f008434 mvni v20.8h, #0x1
700: f947f0a5 ldr x5, [x5, #4064]
704: aa0303e1 mov x1, x3
708: 910143e4 add x4, sp, #0x50
70c: a940344c ldp x12, x13, [x2]
710: a9412c4a ldp x10, x11, [x2, #16]
714: f94000a6 ldr x6, [x5]
718: f900cfe6 str x6, [sp, #408]
71c: d2800006 mov x6, #0x0 // #0
720: a9422448 ldp x8, x9, [x2, #32]
724: a9431c46 ldp x6, x7, [x2, #48]
728: 910463e2 add x2, sp, #0x118
72c: a909b7ec stp x12, x13, [sp, #152]
730: a90aafea stp x10, x11, [sp, #168]
734: a90ba7e8 stp x8, x9, [sp, #184]
738: a90c9fe6 stp x6, x7, [sp, #200]
73c: 94000069 bl 8e0 <scenario1a>
740: 4c402400 ld1 {v0.8h-v3.8h}, [x0]
744: 910043e1 add x1, sp, #0x10
748: aa0403e0 mov x0, x4
74c: 4c402470 ld1 {v16.8h-v19.8h}, [x3]
750: 4e619e85 mul v5.8h, v20.8h, v1.8h
754: 4e608424 add v4.8h, v1.8h, v0.8h
758: 4e628466 add v6.8h, v3.8h, v2.8h
75c: 4e639e87 mul v7.8h, v20.8h, v3.8h
760: 4c002030 st1 {v16.16b-v19.16b}, [x1]
764: 4c002444 st1 {v4.8h-v7.8h}, [x2]
768: 94000072 bl 930 <scenario2a>
76c: ad409885 ldp q5, q6, [x4, #16]
770: 90000081 adrp x1, 10000 <__FRAME_END__ 0xf3d8>
774: 910563e2 add x2, sp, #0x158
778: 3dc00c84 ldr q4, [x4, #48]
77c: 3dc017e7 ldr q7, [sp, #80]
780: f947f021 ldr x1, [x1, #4064]
784: 4e749c83 mul v3.8h, v4.8h, v20.8h
788: 4e668482 add v2.8h, v4.8h, v6.8h
78c: 4e749ca1 mul v1.8h, v5.8h, v20.8h
790: 4e6784a0 add v0.8h, v5.8h, v7.8h
794: 4c002440 st1 {v0.8h-v3.8h}, [x2]
798: f940cfe0 ldr x0, [sp, #408]
79c: f9400022 ldr x2, [x1]
7a0: eb020000 subs x0, x0, x2
7a4: d2800002 mov x2, #0x0 // #0
7a8: 54000081 b.ne 7b8 <main 0xd8> // b.any
7ac: 52800000 mov w0, #0x0 // #0
7b0: a8da7bfd ldp x29, x30, [sp], #416
7b4: d65f03c0 ret
7b8: 97ffffbe bl 6b0 <__stack_chk_fail@plt>
00000000000007bc <_start>:
7bc: d280001d mov x29, #0x0 // #0
7c0: d280001e mov x30, #0x0 // #0
7c4: aa0003e5 mov x5, x0
7c8: f94003e1 ldr x1, [sp]
7cc: 910023e2 add x2, sp, #0x8
7d0: 910003e6 mov x6, sp
7d4: 90000080 adrp x0, 10000 <__FRAME_END__ 0xf3d8>
7d8: f947f800 ldr x0, [x0, #4080]
7dc: 90000083 adrp x3, 10000 <__FRAME_END__ 0xf3d8>
7e0: f947f463 ldr x3, [x3, #4072]
7e4: 90000084 adrp x4, 10000 <__FRAME_END__ 0xf3d8>
7e8: f947e084 ldr x4, [x4, #4032]
7ec: 97ffffad bl 6a0 <__libc_start_main@plt>
7f0: 97ffffb8 bl 6d0 <abort@plt>
00000000000007f4 <call_weak_fn>:
7f4: 90000080 adrp x0, 10000 <__FRAME_END__ 0xf3d8>
7f8: f947ec00 ldr x0, [x0, #4056]
7fc: b4000040 cbz x0, 804 <call_weak_fn 0x10>
800: 17ffffb0 b 6c0 <__gmon_start__@plt>
804: d65f03c0 ret
808: d503201f nop
80c: d503201f nop
0000000000000810 <deregister_tm_clones>:
810: b0000080 adrp x0, 11000 <__data_start>
814: 91004000 add x0, x0, #0x10
818: b0000081 adrp x1, 11000 <__data_start>
81c: 91004021 add x1, x1, #0x10
820: eb00003f cmp x1, x0
824: 540000c0 b.eq 83c <deregister_tm_clones 0x2c> // b.none
828: 90000081 adrp x1, 10000 <__FRAME_END__ 0xf3d8>
82c: f947e421 ldr x1, [x1, #4040]
830: b4000061 cbz x1, 83c <deregister_tm_clones 0x2c>
834: aa0103f0 mov x16, x1
838: d61f0200 br x16
83c: d65f03c0 ret
0000000000000840 <register_tm_clones>:
840: b0000080 adrp x0, 11000 <__data_start>
844: 91004000 add x0, x0, #0x10
848: b0000081 adrp x1, 11000 <__data_start>
84c: 91004021 add x1, x1, #0x10
850: cb000021 sub x1, x1, x0
854: d37ffc22 lsr x2, x1, #63
858: 8b810c41 add x1, x2, x1, asr #3
85c: 9341fc21 asr x1, x1, #1
860: b40000c1 cbz x1, 878 <register_tm_clones 0x38>
864: 90000082 adrp x2, 10000 <__FRAME_END__ 0xf3d8>
868: f947fc42 ldr x2, [x2, #4088]
86c: b4000062 cbz x2, 878 <register_tm_clones 0x38>
870: aa0203f0 mov x16, x2
874: d61f0200 br x16
878: d65f03c0 ret
87c: d503201f nop
0000000000000880 <__do_global_dtors_aux>:
880: a9be7bfd stp x29, x30, [sp, #-32]!
884: 910003fd mov x29, sp
888: f9000bf3 str x19, [sp, #16]
88c: b0000093 adrp x19, 11000 <__data_start>
890: 39404260 ldrb w0, [x19, #16]
894: 35000140 cbnz w0, 8bc <__do_global_dtors_aux 0x3c>
898: 90000080 adrp x0, 10000 <__FRAME_END__ 0xf3d8>
89c: f947e800 ldr x0, [x0, #4048]
8a0: b4000080 cbz x0, 8b0 <__do_global_dtors_aux 0x30>
8a4: b0000080 adrp x0, 11000 <__data_start>
8a8: f9400400 ldr x0, [x0, #8]
8ac: 97ffff79 bl 690 <__cxa_finalize@plt>
8b0: 97ffffd8 bl 810 <deregister_tm_clones>
8b4: 52800020 mov w0, #0x1 // #1
8b8: 39004260 strb w0, [x19, #16]
8bc: f9400bf3 ldr x19, [sp, #16]
8c0: a8c27bfd ldp x29, x30, [sp], #32
8c4: d65f03c0 ret
8c8: d503201f nop
8cc: d503201f nop
00000000000008d0 <frame_dummy>:
8d0: 17ffffdc b 840 <register_tm_clones>
8d4: d503201f nop
8d8: d503201f nop
8dc: d503201f nop
00000000000008e0 <scenario1a>:
8e0: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
8e4: 6e60b833 neg v19.8h, v1.8h
8e8: 6e60b852 neg v18.8h, v2.8h
8ec: 6e60b871 neg v17.8h, v3.8h
8f0: 6e60b810 neg v16.8h, v0.8h
8f4: 6e132004 ext v4.16b, v0.16b, v19.16b, #4
8f8: 6e122025 ext v5.16b, v1.16b, v18.16b, #4
8fc: 6e112046 ext v6.16b, v2.16b, v17.16b, #4
900: 6e102067 ext v7.16b, v3.16b, v16.16b, #4
904: 4c002404 st1 {v4.8h-v7.8h}, [x0]
908: d65f03c0 ret
90c: d503201f nop
0000000000000910 <scenario1b>:
910: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
914: 6f008430 mvni v16.8h, #0x1
918: 4e619e05 mul v5.8h, v16.8h, v1.8h
91c: 4e608424 add v4.8h, v1.8h, v0.8h
920: 4e628466 add v6.8h, v3.8h, v2.8h
924: 4e639e07 mul v7.8h, v16.8h, v3.8h
928: 4c002404 st1 {v4.8h-v7.8h}, [x0]
92c: d65f03c0 ret
0000000000000930 <scenario2a>:
930: ad400025 ldp q5, q0, [x1]
934: ad408423 ldp q3, q1, [x1, #16]
938: 3dc00c24 ldr q4, [x1, #48]
93c: 6e60b800 neg v0.8h, v0.8h
940: 4ea11c22 mov v2.16b, v1.16b
944: 6e60b821 neg v1.8h, v1.8h
948: 6e0020a5 ext v5.16b, v5.16b, v0.16b, #4
94c: 4ea41c80 mov v0.16b, v4.16b
950: 6e60b884 neg v4.8h, v4.8h
954: 6e012063 ext v3.16b, v3.16b, v1.16b, #4
958: 3d800005 str q5, [x0]
95c: 3dc00021 ldr q1, [x1]
960: 6e042042 ext v2.16b, v2.16b, v4.16b, #4
964: ad008803 stp q3, q2, [x0, #16]
968: 6e60b821 neg v1.8h, v1.8h
96c: 6e012000 ext v0.16b, v0.16b, v1.16b, #4
970: 3d800c00 str q0, [x0, #48]
974: d65f03c0 ret
978: d503201f nop
97c: d503201f nop
0000000000000980 <scenario2b>:
980: ad401022 ldp q2, q4, [x1]
984: 6f008420 mvni v0.8h, #0x1
988: ad410c21 ldp q1, q3, [x1, #32]
98c: 4e609c85 mul v5.8h, v4.8h, v0.8h
990: 4e648442 add v2.8h, v2.8h, v4.8h
994: 4e609c60 mul v0.8h, v3.8h, v0.8h
998: 4e638421 add v1.8h, v1.8h, v3.8h
99c: ad001402 stp q2, q5, [x0]
9a0: ad010001 stp q1, q0, [x0, #32]
9a4: d65f03c0 ret
00000000000009a8 <__libc_csu_init>:
9a8: a9bc7bfd stp x29, x30, [sp, #-64]!
9ac: 910003fd mov x29, sp
9b0: a90153f3 stp x19, x20, [sp, #16]
9b4: 90000094 adrp x20, 10000 <__FRAME_END__ 0xf3d8>
9b8: 9135c294 add x20, x20, #0xd70
9bc: a9025bf5 stp x21, x22, [sp, #32]
9c0: 90000095 adrp x21, 10000 <__FRAME_END__ 0xf3d8>
9c4: 9135a2b5 add x21, x21, #0xd68
9c8: cb150294 sub x20, x20, x21
9cc: 2a0003f6 mov w22, w0
9d0: a90363f7 stp x23, x24, [sp, #48]
9d4: aa0103f7 mov x23, x1
9d8: aa0203f8 mov x24, x2
9dc: 97ffff1f bl 658 <_init>
9e0: eb940fff cmp xzr, x20, asr #3
9e4: 54000160 b.eq a10 <__libc_csu_init 0x68> // b.none
9e8: 9343fe94 asr x20, x20, #3
9ec: d2800013 mov x19, #0x0 // #0
9f0: f8737aa3 ldr x3, [x21, x19, lsl #3]
9f4: aa1803e2 mov x2, x24
9f8: 91000673 add x19, x19, #0x1
9fc: aa1703e1 mov x1, x23
a00: 2a1603e0 mov w0, w22
a04: d63f0060 blr x3
a08: eb13029f cmp x20, x19
a0c: 54ffff21 b.ne 9f0 <__libc_csu_init 0x48> // b.any
a10: a94153f3 ldp x19, x20, [sp, #16]
a14: a9425bf5 ldp x21, x22, [sp, #32]
a18: a94363f7 ldp x23, x24, [sp, #48]
a1c: a8c47bfd ldp x29, x30, [sp], #64
a20: d65f03c0 ret
a24: d503201f nop
0000000000000a28 <__libc_csu_fini>:
a28: d65f03c0 ret
Disassembly of section .fini:
0000000000000a2c <_fini>:
a2c: a9bf7bfd stp x29, x30, [sp, #-16]!
a30: 910003fd mov x29, sp
a34: a8c17bfd ldp x29, x30, [sp], #16
a38: d65f03c0 ret
Questions
Normally people load the data from the pointer (using
vld1q_u16_x4
), operates using the Neon datatypes, and store back to another pointer (usingvst1q_u16_x4
), and don't use an approach like the one I used in Scenario 2 (sending the Neon datatypes as inputs/outputs). Is there a general reason why is this?I checked the disassembly of Scenario 1a (starts at line
8e0
) vs. Scenario 2a (starts at line930
). It seems scenario 2a has more data movement. Will this happen in all scenarios? So is it faster to do what I asked in question 1? If so, then why this doesn't happen in Scenario 1b vs 2b (lines910
and980
, respectively).In the main function, there are some add/mul after both Scenario1a and 2a (in lines
750,754,758,75c
and784,788,78c,790
), but my main function has no multiplications nor additions. Why is this happening? (I'm just curious)
Thank you for all your help!
CodePudding user response:
There is absolutely no reason for using pointer to neon datatypes for parameters. Memory doesn't care about datatypes. Compilers are very conservative and bureaucratic, they simply have to. It's like filing an application to authorities: One wrong check mark, your application will land in the wrong hand, causing tremendeous unnecessary trouble.
Short: Keep it simple. Don't try to impress compilers or reviewers in any way.I told you last time to be explicit on memory load and store. You are computing directly from/to memory in scenario2. Never do this. Stick to load->compute->store. Local variables are your best friends. (
__restrict
directive might help)
Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble. A sheer disaster. The reviewer will raise a red flag immediately, and keep his eye on you and all your codes, if you are lucky and didn't get fired the instant.You shouldn't put callees in the same file as the caller. More than often, the caller will inline short non-static callees which makes profiling harder.