/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * vim: set ts=8 sts=2 et sw=2 tw=80:
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "jit/EffectiveAddressAnalysis.h"

#include "jit/IonAnalysis.h"
#include "jit/MIR-wasm.h"
#include "jit/MIR.h"
#include "jit/MIRGenerator.h"
#include "jit/MIRGraph.h"

using namespace js;
using namespace jit;

// This is a very simple pass that tries to merge 32-bit shift-and-add into a
// single MIR node.  It results from a lot of experimentation with more
// aggressive load-effective-address formation, as documented in bug 1970035.
//
// This implementation only covers the two-addend form
// `base + (index << {1,2,3})` (and the same the other way around).  Previous
// experimentation showed that, while the 3-addend form
// `base + (index << {1,2,3}) + constant` can be reliably identified and merged
// into a single node, it doesn't reliably produce faster code.  Also, the
// implementation complexity is much higher than what is below.
//
// 3-addend LEAs can be completed in a single cycle on high-end Intels, but
// take 2 cycles on lower end Intels.  By comparison the 2-addend form is
// believed to take a single cycle on all Intels.  On arm64, the 3-addend form
// is not supported in a single machine instruction, and so can require zero,
// one or two extra instructions, depending on the size of the constant,
// possibly an extra register, and consequently some number of extra cycles.
//
// Because of this, restricting the transformation to the 2-addend case
// simplifies both the implementation and more importantly the cost-tradeoff
// landscape.  It captures most of the wins of the 3-addend case while more
// reliably producing nodes that can execute in a single cycle on all primary
// targets.

// =====================================================================

// On non-x86/x64 targets, incorporating any non-zero constant (displacement)
// in an EffectiveAddress2 node is not free, because the constant may have to
// be synthesised into a register in the back end.  Worse, on all such targets,
// arbitrary 32-bit constants will take two instructions to synthesise, which
// can lead to a net performance loss.
//
// `OffsetIsSmallEnough` is used in the logic below to restrict constants to
// single-instruction forms.  It is necessarily target-dependent.  Note this is
// merely a heuristic -- the resulting code should be *correct* on all targets
// regardless of the value returned.

static bool OffsetIsSmallEnough(int32_t imm) {
#if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
  // Any 32-bit displacement can be folded in at no cost on x86_32 and
  // x86_64, so every value is acceptable.
  return true;
#elif defined(JS_CODEGEN_ARM64) || defined(JS_CODEGEN_ARM)
  // Values in this range can be materialised on arm64 with a single
  // `movz #imm` or `movn #imm`; arm32 is treated the same way.
  constexpr int32_t kLargestMagnitude = 0xFFFF;
  return !(imm < -kLargestMagnitude || imm > kLargestMagnitude);
#elif defined(JS_CODEGEN_RISCV64) || defined(JS_CODEGEN_LOONG64) || \
    defined(JS_CODEGEN_MIPS64)
  // These targets share a tighter limit: magnitudes up to 0xFFF only.
  constexpr int32_t kLargestMagnitude = 0xFFF;
  return !(imm < -kLargestMagnitude || imm > kLargestMagnitude);
#elif defined(JS_CODEGEN_WASM32) || defined(JS_CODEGEN_NONE)
  // No restriction is applied for these configurations.
  return true;
#else
#  error "This needs to be filled in for your platform"
#endif
}

// Recognises a left shift by a constant amount of 1, 2 or 3.  When `def` is
// such a shift (and is not marked recovered-on-bailout), the result is the
// shifted operand paired with the shift amount.  In every other case the
// result is the pair `(nullptr, 0)`.
static std::pair<MDefinition*, int32_t> IsShiftBy123(MDefinition* def) {
  using Match = std::pair<MDefinition*, int32_t>;
  const Match noMatch(nullptr, 0);

  MOZ_ASSERT(def->type() == MIRType::Int32);

  // Anything that is not a usable MLsh is rejected straight away.
  MLsh* shiftNode = def->isLsh() ? def->toLsh() : nullptr;
  if (!shiftNode || shiftNode->isRecoveredOnBailout()) {
    return noMatch;
  }

  // Only shifts by a known constant amount are of interest.
  MDefinition* amountDef = shiftNode->rhs();
  MOZ_ASSERT(amountDef->type() == MIRType::Int32);
  MConstant* amountConst = amountDef->maybeConstantValue();
  if (!amountConst) {
    return noMatch;
  }

  const int32_t amount = amountConst->toInt32();
  const bool amountInRange = amount >= 1 && amount <= 3;
  return amountInRange ? Match(shiftNode->lhs(), amount) : noMatch;
}

// Attempts to fold `base + (index << {1,2,3})` (addends in either order) into
// a single node that takes over all uses of `root`:
//
//  * constant `base`: an MEffectiveAddress2 node with `base` as displacement,
//    unless the constant is zero or fails `OffsetIsSmallEnough`, in which
//    case nothing is changed;
//  * non-constant `base`: an MEffectiveAddress3 node with zero displacement.
//
// Nothing is changed either if no suitable shift is found or if `root` is
// marked recovered-on-bailout.  `root` itself is left in the graph.
static void TryMatchShiftAdd(TempAllocator& alloc, MAdd* root) {
  MOZ_ASSERT(root->isAdd());
  MOZ_ASSERT(root->type() == MIRType::Int32);
  MOZ_ASSERT(root->hasUses());

  MDefinition* base = nullptr;
  MDefinition* index = nullptr;
  int32_t shift = 0;

  // Checks whether `candidate` is the shifted addend.  If so, records the
  // match, with `other` playing the role of the base, and returns true.
  auto matchShiftedOperand = [&](MDefinition* candidate, MDefinition* other) {
    auto [shifted, amount] = IsShiftBy123(candidate);
    MOZ_ASSERT((shifted == nullptr) == (amount == 0));
    if (!shifted) {
      return false;
    }
    base = other;
    index = shifted;
    shift = amount;
    return true;
  };

  // The shift amount has to be constant, but `base` and `index` may be
  // anything.  The right-hand operand is examined first, then the left.
  if (!matchShiftedOperand(root->rhs(), root->lhs()) &&
      !matchShiftedOperand(root->lhs(), root->rhs())) {
    return;
  }
  MOZ_ASSERT(shift >= 1 && shift <= 3);

  // The MLsh node has already been vetted by IsShiftBy123; the MAdd node
  // must not be recovered-on-bailout either.
  if (root->isRecoveredOnBailout()) {
    return;
  }

  // The pattern is present; decide which kind of node to build.
  Scale scale = ShiftToScale(shift);
  MOZ_ASSERT(scale != TimesOne);

  MInstruction* replacement = nullptr;
  if (MConstant* baseConst = base->maybeConstantValue()) {
    int32_t displacement = baseConst->toInt32();
    // A zero displacement would merge only the shift into the new node, which
    // isn't worth doing.  A displacement that needs more than one insn to get
    // into a register makes a win less likely.  See bug 1979829.
    if (displacement == 0 || !OffsetIsSmallEnough(displacement)) {
      return;
    }
    replacement = MEffectiveAddress2::New(alloc, index, scale, displacement);
  } else {
    replacement = MEffectiveAddress3::New(alloc, base, index, scale, 0);
  }

  // Redirect all users to the new node and place it right after `root`.
  root->replaceAllUsesWith(replacement);
  root->block()->insertAfter(root, replacement);

  if (JitSpewEnabled(JitSpew_EAA)) {
    JitSpewCont(JitSpew_EAA, "  create: '");
    DumpMIRDefinition(JitSpewPrinter(), replacement, /*showDetails=*/false);
    JitSpewCont(JitSpew_EAA, "'\n");
  }
}

// =====================================================================
//
// Top level driver.

bool EffectiveAddressAnalysis::analyze() {
  JitSpew(JitSpew_EAA, "Begin");

  for (ReversePostorderIterator block(graph_.rpoBegin());
       block != graph_.rpoEnd(); block++) {
    // Traverse backwards through `block`, trying to rewrite each MIR node if
    // we can.  Rewriting may cause nodes to become dead.  We do not try to
    // remove those here, but leave them for a later DCE pass to clear up.

    MInstructionReverseIterator ri(block->rbegin());
    while (ri != block->rend()) {
      // Nodes are added immediately after `curr`, so the iterator won't
      // traverse them, since we're iterating backwards.
      MInstruction* curr = *ri;
      ri++;

      if (MOZ_LIKELY(!curr->isAdd())) {
        continue;
      }
      if (curr->type() != MIRType::Int32 || !curr->hasUses()) {
        continue;
      }

      // This check needs to precede any allocation done in this loop.
      if (MOZ_UNLIKELY(!graph_.alloc().ensureBallast())) {
        return false;
      }

      TryMatchShiftAdd(graph_.alloc(), curr->toAdd());
    }
  }

  JitSpew(JitSpew_EAA, "End");
  return true;
}