vc4: Use named parameters for the NEON inline asm.

This makes the asm code more intelligible and clarifies the functional
change in the next commit.

(commit message and commit squashing by anholt)
This commit is contained in:
Carsten Haitzler (Rasterman) 2019-01-08 16:05:25 +00:00 committed by Eric Anholt
parent f6292c32cc
commit 522f688471
1 changed file with 100 additions and 80 deletions

View File

@ -37,20 +37,22 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
"vldm %0, {q0, q1, q2, q3}\n"
"vldm %[gpu], {q0, q1, q2, q3}\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
"vst1.8 d0, [%1], %2\n"
"vst1.8 d1, [%1], %2\n"
"vst1.8 d2, [%1], %2\n"
"vst1.8 d3, [%1], %2\n"
"vst1.8 d4, [%1], %2\n"
"vst1.8 d5, [%1], %2\n"
"vst1.8 d6, [%1], %2\n"
"vst1.8 d7, [%1]\n"
"vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d7, [%[cpu]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
@ -58,21 +60,24 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
"vldm %0, {q0, q1, q2, q3};\n"
"vldm %[gpu], {q0, q1, q2, q3};\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (vst1 can only store one d-register
* at a time).
*/
"vst1.8 d0, [%1], %3\n"
"vst1.8 d1, [%2], %3\n"
"vst1.8 d2, [%1], %3\n"
"vst1.8 d3, [%2], %3\n"
"vst1.8 d4, [%1], %3\n"
"vst1.8 d5, [%2], %3\n"
"vst1.8 d6, [%1]\n"
"vst1.8 d7, [%2]\n"
"vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
"vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
"vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
"vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
"vst1.8 d6, [%[cpu]]\n"
"vst1.8 d7, [%[cpu2]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
}
@ -82,20 +87,22 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
/* Load from the GPU in one shot, no interleave, to
* v0-v3.
*/
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
"st1 {v0.D}[0], [%1], %2\n"
"st1 {v0.D}[1], [%1], %2\n"
"st1 {v1.D}[0], [%1], %2\n"
"st1 {v1.D}[1], [%1], %2\n"
"st1 {v2.D}[0], [%1], %2\n"
"st1 {v2.D}[1], [%1], %2\n"
"st1 {v3.D}[0], [%1], %2\n"
"st1 {v3.D}[1], [%1]\n"
"st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
"st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
"st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v3.D}[1], [%[cpu]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
@ -103,21 +110,24 @@ v3d_load_utile(void *cpu, uint32_t cpu_stride,
/* Load from the GPU in one shot, no interleave, to
* v0-v3.
*/
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
"ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (Each st1 below stores one 64-bit lane
* at a time).
*/
"st1 {v0.D}[0], [%1], %3\n"
"st1 {v0.D}[1], [%2], %3\n"
"st1 {v1.D}[0], [%1], %3\n"
"st1 {v1.D}[1], [%2], %3\n"
"st1 {v2.D}[0], [%1], %3\n"
"st1 {v2.D}[1], [%2], %3\n"
"st1 {v3.D}[0], [%1]\n"
"st1 {v3.D}[1], [%2]\n"
"st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
"st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
"st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
"st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
"st1 {v3.D}[0], [%[cpu]]\n"
"st1 {v3.D}[1], [%[cpu2]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
}
@ -139,20 +149,22 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
"vld1.8 d0, [%1], %2\n"
"vld1.8 d1, [%1], %2\n"
"vld1.8 d2, [%1], %2\n"
"vld1.8 d3, [%1], %2\n"
"vld1.8 d4, [%1], %2\n"
"vld1.8 d5, [%1], %2\n"
"vld1.8 d6, [%1], %2\n"
"vld1.8 d7, [%1]\n"
"vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d7, [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave, from
* d0-d7.
*/
"vstm %0, {q0, q1, q2, q3}\n"
"vstm %[gpu], {q0, q1, q2, q3}\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
} else if (gpu_stride == 16) {
@ -161,18 +173,21 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
* destination. (vld1 can only load one d-register
* at a time).
*/
"vld1.8 d0, [%1], %3\n"
"vld1.8 d1, [%2], %3\n"
"vld1.8 d2, [%1], %3\n"
"vld1.8 d3, [%2], %3\n"
"vld1.8 d4, [%1], %3\n"
"vld1.8 d5, [%2], %3\n"
"vld1.8 d6, [%1]\n"
"vld1.8 d7, [%2]\n"
"vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
"vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
"vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
"vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
"vld1.8 d6, [%[cpu]]\n"
"vld1.8 d7, [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"vstm %0, {q0, q1, q2, q3}\n"
"vstm %[gpu], {q0, q1, q2, q3}\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
return;
}
@ -182,18 +197,20 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
"ld1 {v0.D}[0], [%1], %2\n"
"ld1 {v0.D}[1], [%1], %2\n"
"ld1 {v1.D}[0], [%1], %2\n"
"ld1 {v1.D}[1], [%1], %2\n"
"ld1 {v2.D}[0], [%1], %2\n"
"ld1 {v2.D}[1], [%1], %2\n"
"ld1 {v3.D}[0], [%1], %2\n"
"ld1 {v3.D}[1], [%1]\n"
"ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
"ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
"ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
"ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v3.D}[1], [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
} else if (gpu_stride == 16) {
@ -202,18 +219,21 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride,
* destination. (Each ld1 below loads one 64-bit lane
* at a time).
*/
"ld1 {v0.D}[0], [%1], %3\n"
"ld1 {v0.D}[1], [%2], %3\n"
"ld1 {v1.D}[0], [%1], %3\n"
"ld1 {v1.D}[1], [%2], %3\n"
"ld1 {v2.D}[0], [%1], %3\n"
"ld1 {v2.D}[1], [%2], %3\n"
"ld1 {v3.D}[0], [%1]\n"
"ld1 {v3.D}[1], [%2]\n"
"ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
"ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
"ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
"ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
"ld1 {v3.D}[0], [%[cpu]]\n"
"ld1 {v3.D}[1], [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
"st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
: "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
: [gpu] "r"(gpu),
[cpu] "r"(cpu),
[cpu2] "r"(cpu + 8),
[cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
return;
}