/*
 * Copyright (C) 2022 Alyssa Rosenzweig
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "agx_compiler.h"
#include "agx_builder.h"

/*
 * Emits code for
 *
 *    for (int i = 0; i < n; ++i)
 *       registers[dests[i]] = registers[srcs[i]];
 *
 * ...with all copies happening in parallel.
 *
 * That is, emit machine instructions equivalent to a parallel copy. This is
 * used to lower not only parallel copies but also collects and splits, which
 * also have parallel copy semantics.
 *
 * We only handle register-register copies, not general agx_index sources.
 * This suffices for the internal use in register allocation.
 */

static void
do_copy(agx_builder *b, const struct agx_copy *copy)
{
   agx_mov_to(b, agx_register(copy->dest, copy->size),
              agx_register(copy->src, copy->size));
}
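/*
 * Swap the contents of two registers with three XORs, so no scratch register
 * is required. A worked example with arbitrary 4-bit values: starting from
 * x = 0101 and y = 0011, the three XORs below produce x = 0110, then
 * y = 0101 (the old x), and finally x = 0011 (the old y). Avoiding a
 * temporary matters here because every register may be live while a
 * parallel copy executes.
 */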
static void
do_swap(agx_builder *b, const struct agx_copy *copy)
{
   if (copy->dest == copy->src)
      return;

   agx_index x = agx_register(copy->dest, copy->size);
   agx_index y = agx_register(copy->src, copy->size);

   agx_xor_to(b, x, x, y);
   agx_xor_to(b, y, x, y);
   agx_xor_to(b, x, x, y);
}

struct copy_ctx {
   /* Number of copies being processed */
   unsigned entry_count;

   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, the physreg is unblocked and can be
    * moved to.
    */
   unsigned physreg_use_count[AGX_NUM_REGS];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct agx_copy *physreg_dest[AGX_NUM_REGS];

   /* Copies being processed. Splitting 32-bit copies into 16-bit halves can
    * grow this past the original count, but each 16-bit physreg is the
    * destination of at most one entry, so AGX_NUM_REGS bounds the total.
    */
   struct agx_copy entries[AGX_NUM_REGS];
};

static bool
entry_blocked(struct agx_copy *entry, struct copy_ctx *ctx)
{
   for (unsigned i = 0; i < agx_size_align_16(entry->size); i++) {
      if (ctx->physreg_use_count[entry->dest + i] != 0)
         return true;
   }

   return false;
}

static bool
is_real(struct agx_copy *entry)
{
   /* TODO: Allow immediates in agx_copy */
   return true;
}

/* TODO: Generalize to other bit sizes */
static void
split_32bit_copy(struct copy_ctx *ctx, struct agx_copy *entry)
{
   assert(!entry->done);
   assert(is_real(entry));
   assert(agx_size_align_16(entry->size) == 2);

   struct agx_copy *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dest = entry->dest + 1;
   new_entry->src = entry->src + 1;
   new_entry->done = false;
   entry->size = AGX_SIZE_16;
   new_entry->size = AGX_SIZE_16;
   ctx->physreg_dest[entry->dest + 1] = new_entry;
}

void
agx_emit_parallel_copies(agx_builder *b, struct agx_copy *copies,
                         unsigned num_copies)
{
   struct copy_ctx _ctx = {
      .entry_count = num_copies
   };

   struct copy_ctx *ctx = &_ctx;

   /* Set up the bookkeeping */
   memset(ctx->physreg_dest, 0, sizeof(ctx->physreg_dest));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &copies[i];

      ctx->entries[i] = *entry;

      for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
         if (is_real(entry))
            ctx->physreg_use_count[entry->src + j]++;

         /* Copies should not have overlapping destinations. Point into our
          * local entries array, which is what the algorithm mutates.
          */
         assert(!ctx->physreg_dest[entry->dest + j]);
         ctx->physreg_dest[entry->dest + j] = &ctx->entries[i];
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destinations aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dest to unblock
       * cycles that src is involved in.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];

         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(b, entry);

            for (unsigned j = 0; j < agx_size_align_16(entry->size); j++) {
               if (is_real(entry))
                  ctx->physreg_use_count[entry->src + j]--;

               ctx->physreg_dest[entry->dest + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. Since the
       * register file is addressed in 16-bit units, a 32-bit copy may be
       * blocked on only one of its 16-bit halves, and splitting it helps
       * get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct agx_copy *entry = &ctx->entries[i];

         if (entry->done || (agx_size_align_16(entry->size) != 2))
            continue;

         if ((ctx->physreg_use_count[entry->dest] == 0 ||
              ctx->physreg_use_count[entry->dest + 1] == 0) &&
             is_real(entry)) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }
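   /* A concrete example of what remains after the loop above (register
    * numbers chosen arbitrarily): the parallel copy
    * {r0 <- r1, r1 <- r2, r2 <- r0} is a 3-cycle in which every destination
    * is also a pending source, so step 1 emits no copies and step 2 has
    * nothing to split. Step 3 below swaps r0 and r1, which completes the
    * first copy and redirects r2 <- r0 to r2 <- r1; swapping r1 and r2 then
    * completes the second copy and turns the last entry into the trivial
    * r2 <- r2.
    */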
   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *            ^             |
    *            |-------------|
    *
    * then n_2 would be the destination of 2 copies, which is illegal
    * (checked above in an assert). So n_1 must be part of a cycle:
    *
    *    n_1 -> n_2 -> ... -> n_i
    *     ^                    |
    *     |--------------------|
    *
    * and this must be the only cycle n_1 is involved in, because any other
    * path starting from n_1 would also have to end in n_1, resulting in
    * a node somewhere along the way being the destination of 2 copies
    * when the 2 paths merge.
    *
    * The way we resolve the cycle is through picking a copy (n_1, n_2)
    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    * out of the cycle:
    *
    *    n_1 -> ... -> n_i
    *     ^             |
    *     |-------------|
    *
    * and we can keep repeating this until the cycle is empty.
    */
   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct agx_copy *entry = &ctx->entries[i];

      if (entry->done)
         continue;

      assert(is_real(entry));

      /* catch trivial copies */
      if (entry->dest == entry->src) {
         entry->done = true;
         continue;
      }

      do_swap(b, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (agx_size_align_16(entry->size) == 1) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct agx_copy *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src <= entry->dest &&
                blocking->src + 1 >= entry->dest &&
                agx_size_align_16(blocking->size) == 2) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct agx_copy *blocking = &ctx->entries[j];

         if (blocking->src >= entry->dest &&
             blocking->src < entry->dest + agx_size_align_16(entry->size)) {
            blocking->src = entry->src + (blocking->src - entry->dest);
         }
      }

      entry->done = true;
   }
}
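
/*
 * Usage sketch (hypothetical register numbers; assumes a configured
 * agx_builder b, Mesa's ARRAY_SIZE macro, and that AGX_SIZE_32 names the
 * 32-bit counterpart of AGX_SIZE_16 above): lowering a swap of two 16-bit
 * registers together with an independent 32-bit move would look like
 *
 *    struct agx_copy copies[] = {
 *       { .dest = 0, .src = 1, .size = AGX_SIZE_16 },
 *       { .dest = 1, .src = 0, .size = AGX_SIZE_16 },
 *       { .dest = 4, .src = 2, .size = AGX_SIZE_32 },
 *    };
 *
 *    agx_emit_parallel_copies(b, copies, ARRAY_SIZE(copies));
 *
 * Step 1 emits the 32-bit move immediately (16-bit registers 4-5 are not
 * pending sources), and step 3 resolves the remaining 2-cycle with a single
 * XOR swap.
 */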