// I //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/EHPersonalities.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include 
"llvm/IR/PatternMatch.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" static cl::opt ExperimentalPrefInnermostLoopAlignment( "x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc( "Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden); static cl::opt BrMergingBaseCostThresh( "x86-br-merging-base-cost", cl::init(2), cl::desc( "Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden); static cl::opt BrMergingCcmpBias( "x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden); static cl::opt WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden); static cl::opt BrMergingLikelyBias( "x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. 
For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden); static cl::opt BrMergingUnlikelyBias( "x86-br-merging-unlikely-bias", cl::init(-1), cl::desc( "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden); static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM, STI), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // X86 instruction cache is coherent with its data cache so we can use the // default expansion to a no-op. setOperationAction(ISD::CLEAR_CACHE, MVT::Other, Expand); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. 
if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOptLevel::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.canUseCMPXCHG16B()) setMaxAtomicSizeInBitsSupported(128); else if (Subtarget.canUseCMPXCHG8B()) setMaxAtomicSizeInBitsSupported(64); else setMaxAtomicSizeInBitsSupported(32); setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64); setMaxLargeFPConvertBitWidthSupported(128); // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) { setCondCodeAction(ISD::SETOEQ, VT, Expand); setCondCodeAction(ISD::SETUNE, VT, Expand); } // Integer absolute. 
if (Subtarget.canUseCMOV()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Absolute difference. for (auto Op : {ISD::ABDS, ISD::ABDU}) { setOperationAction(Op , MVT::i8 , Custom); setOperationAction(Op , MVT::i16 , Custom); setOperationAction(Op , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(Op , MVT::i64 , Custom); } // Signed saturation subtraction. setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom); setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom); setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom); // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { // For slow shld targets we only lower for code size. LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal; setOperationAction(ShiftOp , MVT::i8 , Custom); setOperationAction(ShiftOp , MVT::i16 , Custom); setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); if (Subtarget.is64Bit()) setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); } if (!Subtarget.useSoftFloat()) { // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. 
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); // SSE has no i16 to fp conversion, only i32. We promote in the handler // to allow f80 to use i16 and f64 to use i16 with sse1 only setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. 
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); // FIXME: This doesn't generate invalid exception when it should. PR44019. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::LRINT, MVT::f32, Custom); setOperationAction(ISD::LRINT, MVT::f64, Custom); setOperationAction(ISD::LLRINT, MVT::f32, Custom); setOperationAction(ISD::LLRINT, MVT::f64, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::LRINT, MVT::i64, Custom); setOperationAction(ISD::LLRINT, MVT::i64, Custom); } } if (Subtarget.hasSSE2()) { // Custom lowering for saturating float to int conversions. // We handle promotion to larger result types manually. for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); } } if (Subtarget.hasAVX10_2()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64}) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal); } if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal); 
} } // Handle address space casts between mixed sized pointers. setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!Subtarget.hasSSE2()) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. 
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FREM, MVT::f32, LibCall); setOperationAction(ISD::FREM, MVT::f64, LibCall); setOperationAction(ISD::FREM, MVT::f80, LibCall); setOperationAction(ISD::FREM, MVT::f128, LibCall); if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) { setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom); setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom); setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom); setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom); setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom); } // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to // promote that too. 
setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::CTLZ , VT, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); } } for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16, ISD::STRICT_FP_TO_FP16}) { // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. setOperationAction( Op, MVT::f32, (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand); // There's never any support for operations beyond MVT::f32. 
setOperationAction(Op, MVT::f64, Expand); setOperationAction(Op, MVT::f80, Expand); setOperationAction(Op, MVT::f128, Expand); } for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand); setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand); } for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); setTruncStoreAction(VT, MVT::f16, Expand); setTruncStoreAction(VT, MVT::bf16, Expand); setOperationAction(ISD::BF16_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_BF16, VT, Custom); } setOperationAction(ISD::PARITY, MVT::i8, Custom); setOperationAction(ISD::PARITY, MVT::i16, Custom); setOperationAction(ISD::PARITY, MVT::i32, Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::PARITY, MVT::i64, Custom); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); // popcntw is longer to encode than popcntl and also has a false dependency // on the dest that popcntl hasn't had since Cannon Lake. setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Custom); setOperationAction(ISD::CTPOP , MVT::i16 , Custom); setOperationAction(ISD::CTPOP , MVT::i32 , Custom); setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // X86 wants to expand cmov itself. 
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } setOperationAction(ISD::COND_LOOP, MVT::Other, Custom); // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); // Darwin ABI issue. 
for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch()) setOperationAction(ISD::PREFETCH , MVT::Other, Custom); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (!Subtarget.is64Bit()) setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); if (Subtarget.is64Bit() && Subtarget.hasAVX()) { // All CPUs supporting AVX will atomically load/store aligned 128-bit // values, so we can emit [V]MOVAPS/[V]MOVDQA. 
setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); } if (Subtarget.canUseCMPXCHG16B()) setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); if (Subtarget.isTargetPS()) setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); else setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); auto setF16Action = [&] (MVT VT, LegalizeAction Action) { setOperationAction(ISD::FABS, VT, Action); setOperationAction(ISD::FNEG, VT, Action); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FREM, VT, Action); setOperationAction(ISD::FMA, VT, Action); setOperationAction(ISD::FMINNUM, VT, Action); setOperationAction(ISD::FMAXNUM, VT, Action); setOperationAction(ISD::FMINIMUM, VT, Action); setOperationAction(ISD::FMAXIMUM, VT, Action); setOperationAction(ISD::FMINIMUMNUM, VT, Action); setOperationAction(ISD::FMAXIMUMNUM, VT, Action); setOperationAction(ISD::FSIN, VT, Action); setOperationAction(ISD::FCOS, VT, Action); setOperationAction(ISD::FSINCOS, VT, Action); setOperationAction(ISD::FTAN, VT, Action); setOperationAction(ISD::FSQRT, VT, Action); setOperationAction(ISD::FPOW, VT, Action); setOperationAction(ISD::FPOWI, VT, Action); setOperationAction(ISD::FLOG, VT, Action); setOperationAction(ISD::FLOG2, VT, Action); setOperationAction(ISD::FLOG10, VT, Action); setOperationAction(ISD::FEXP, VT, Action); setOperationAction(ISD::FEXP2, VT, Action); setOperationAction(ISD::FEXP10, VT, Action); setOperationAction(ISD::FCEIL, VT, Action); setOperationAction(ISD::FFLOOR, VT, Action); setOperationAction(ISD::FNEARBYINT, VT, Action); setOperationAction(ISD::FRINT, VT, Action); setOperationAction(ISD::BR_CC, VT, Action); setOperationAction(ISD::SETCC, VT, Action); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Action); setOperationAction(ISD::FROUND, VT, Action); setOperationAction(ISD::FROUNDEVEN, VT, Action); setOperationAction(ISD::FTRUNC, VT, Action); setOperationAction(ISD::FLDEXP, VT, Action); setOperationAction(ISD::FSINCOSPI, VT, Action); }; if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { // f16, f32 and f64 
use SSE. // Set up the FP register classes. addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); // Disable f32->f64 extload as we can only generate this in one instruction // under optsize. So its easier to pattern match (fpext (load)) for that // case instead of needing to emit 2 instructions for extload in the // non-optsize case. setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Half type will be promoted by default. 
setF16Action(MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); setOperationAction(ISD::FABS, MVT::f16, Custom); setOperationAction(ISD::FNEG, MVT::f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote); setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLDEXP, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote); setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote); setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote); setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote); setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote); setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); setOperationAction(ISD::STRICT_FROUNDEVEN, 
MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::LLROUND, MVT::f16, Expand); setOperationAction(ISD::LROUND, MVT::f16, Expand); setOperationAction(ISD::LRINT, MVT::f16, Expand); setOperationAction(ISD::LLRINT, MVT::f16, Expand); setOperationAction(ISD::STRICT_LLROUND, MVT::f16, Promote); setOperationAction(ISD::STRICT_LROUND, MVT::f16, Promote); setOperationAction(ISD::STRICT_LRINT, MVT::f16, Promote); setOperationAction(ISD::STRICT_LLRINT, MVT::f16, Promote); // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); if (UseX87) addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. if (UseX87) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { // Always expand sin/cos functions even though x87 has an instruction. 
setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } // Expand FP32 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f32)) { if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f64)) { if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // Support fp16 0 immediate. if (isTypeLegal(MVT::f16)) addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); // Handle constrained floating-point operations of scalar. 
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // f80 always uses X87. if (UseX87) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. 
// clang-format off setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FTAN , MVT::f80, Expand); setOperationAction(ISD::FASIN , MVT::f80, Expand); setOperationAction(ISD::FACOS , MVT::f80, Expand); setOperationAction(ISD::FATAN , MVT::f80, Expand); setOperationAction(ISD::FATAN2 , MVT::f80, Expand); setOperationAction(ISD::FSINH , MVT::f80, Expand); setOperationAction(ISD::FCOSH , MVT::f80, Expand); setOperationAction(ISD::FTANH , MVT::f80, Expand); // clang-format on setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, LibCall); setOperationAction(ISD::LLROUND, MVT::f80, LibCall); setOperationAction(ISD::LRINT, MVT::f80, Custom); setOperationAction(ISD::LLRINT, MVT::f80, Custom); // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); } else { setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); } // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. 
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); setOperationAction(ISD::FABS, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); // clang-format off setOperationAction(ISD::FSIN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); setOperationAction(ISD::FCOS, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); setOperationAction(ISD::FTAN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall); // clang-format on // No STRICT_FSINCOS setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. 
if (isTypeLegal(MVT::f32)) { setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); } if (isTypeLegal(MVT::f64)) { setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); } if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); } setOperationAction(ISD::SETCC, MVT::f128, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FPOW , MVT::f128 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FEXP10, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. 
for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { // clang-format off setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FTAN, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FEXP10, VT, Expand); // clang-format on } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::FROUNDEVEN, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); 
setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16 || VT.getVectorElementType() == MVT::bf16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); } for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v4i32, Custom); setOperationAction(ISD::MULHS, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); setOperationAction(ISD::SMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v16i8, Custom); setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::LRINT, MVT::v4f32, Custom); setOperationAction(ISD::LRINT, MVT::v2i32, Custom); setOperationAction(ISD::AND, MVT::i128, Custom); setOperationAction(ISD::OR, MVT::i128, Custom); setOperationAction(ISD::XOR, MVT::i128, Custom); if (Subtarget.hasPCLMUL()) { for (auto VT : 
{MVT::i64, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::CLMUL, VT, Custom); setOperationAction(ISD::CLMULH, VT, Custom); } setOperationAction(ISD::CLMUL, MVT::i32, Custom); setOperationAction(ISD::CLMUL, MVT::i16, Custom); setOperationAction(ISD::CLMUL, MVT::i8, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); } setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v4f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Custom); setOperationAction(ISD::FABS, MVT::v8f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom); // Custom lower v2i64 and v2f64 selects. 
setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); setOperationAction(ISD::SELECT, MVT::v8f16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); // Add 32-bit vector stores to help vectorization opportunities. setOperationAction(ISD::STORE, MVT::v2i16, Custom); setOperationAction(ISD::STORE, MVT::v4i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, 
Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); if (VT == MVT::v2i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) { setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); } setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); for 
(auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); } // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); setOperationAction(ISD::SUB, MVT::i16, Custom); setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); setOperationAction(ISD::FROUND, RoundedTy, Custom); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); // FIXME: Do we need to handle scalar-to-vector here? 
setOperationAction(ISD::MUL, MVT::v4i32, Legal); setOperationAction(ISD::SMULO, MVT::v2i32, Custom); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can // do the pre and post work in the vector domain. setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); // We need to mark SINT_TO_FP as Custom even though we want to expand it // so that DAG combine doesn't try to turn it into uint_to_fp.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? 
&X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); setOperationAction(ISD::LRINT, MVT::v4f64, Custom); setOperationAction(ISD::AND, MVT::i256, Custom); setOperationAction(ISD::OR, MVT::i256, Custom); setOperationAction(ISD::XOR, MVT::i256, Custom); // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. 
setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. 
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); if (VT == MVT::v4i64) continue; setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); setOperationAction(ISD::SELECT, MVT::v16f16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v4f64, Custom); setOperationAction(ISD::SETCC, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom); if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); } } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i32, Custom); setOperationAction(ISD::MULHS, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::SMULO, MVT::v32i8, Custom); setOperationAction(ISD::UMULO, MVT::v32i8, Custom); setOperationAction(ISD::ABS, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v4i64, Custom); setOperationAction(ISD::UMAX, MVT::v4i64, Custom); setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } if (HasInt256) { // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. 
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. 
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); } setF16Action(MVT::v16f16, Expand); setOperationAction(ISD::FNEG, MVT::v16f16, Custom); setOperationAction(ISD::FABS, MVT::v16f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v16f16, Custom); setOperationAction(ISD::FADD, MVT::v16f16, Expand); setOperationAction(ISD::FSUB, MVT::v16f16, Expand); setOperationAction(ISD::FMUL, MVT::v16f16, Expand); setOperationAction(ISD::FDIV, MVT::v16f16, Expand); // Only PCLMUL required as we always unroll clmul vectors. if (Subtarget.hasPCLMUL()) { for (auto VT : {MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::CLMUL, VT, Custom); setOperationAction(ISD::CLMULH, VT, Custom); } } if (HasInt256) { setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Custom legalize 2x32 to get a little better code. 
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } if (Subtarget.hasGFNI()) { setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); } } if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && Subtarget.hasF16C()) { for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); } for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) { setOperationAction(ISD::FP_EXTEND, VT, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); } for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); } setOperationAction(ISD::SETCC, MVT::v8f16, Custom); setOperationAction(ISD::SETCC, MVT::v16f16, Custom); } // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. 
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { setOperationAction(ISD::LOAD, MVT::v1i1, Custom); setOperationAction(ISD::LOAD, MVT::v2i1, Custom); setOperationAction(ISD::LOAD, MVT::v4i1, Custom); setOperationAction(ISD::LOAD, MVT::v8i1, Custom); setOperationAction(ISD::STORE, MVT::v1i1, Custom); setOperationAction(ISD::STORE, MVT::v2i1, Custom); setOperationAction(ISD::STORE, MVT::v4i1, Custom); setOperationAction(ISD::STORE, MVT::v8i1, Custom); } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) setOperationAction(ISD::VSELECT, VT, Expand); for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (Subtarget.hasDQI() && Subtarget.hasVLX()) { for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::LRINT, VT, Legal); setOperationAction(ISD::LLRINT, VT, Legal); } } // This block controls legalization for 512-bit operations with 8/16/32/64 bit // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. 
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { bool HasBWI = Subtarget.hasBWI(); addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v32f16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); if (HasBWI) setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::FCANONICALIZE, VT, Custom); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); setOperationAction(ISD::LRINT, MVT::v8f64, Subtarget.hasDQI() ? 
Legal : Custom); if (Subtarget.hasDQI()) setOperationAction(ISD::LLRINT, MVT::v8f64, Legal); setOperationAction(ISD::AND, MVT::i512, Custom); setOperationAction(ISD::OR, MVT::i512, Custom); setOperationAction(ISD::XOR, MVT::i512, Custom); setOperationAction(ISD::ADD, MVT::i512, Custom); setOperationAction(ISD::SUB, MVT::i512, Custom); for (MVT VT : { MVT::v16i1, MVT::v16i8 }) { setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); } for (MVT VT : { MVT::v16i16, MVT::v16i32 }) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, 
MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (HasBWI) setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. if (!Subtarget.hasVLX()) { for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); if (HasBWI) { // Extends from v64i1 masks to 512-bit vectors. 
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); } for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? 
Legal : Custom); setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); setOperationAction(ISD::SMULO, MVT::v64i8, Custom); setOperationAction(ISD::UMULO, MVT::v64i8, Custom); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::ABDS, VT, Custom); setOperationAction(ISD::ABDU, VT, Custom); setOperationAction(ISD::BITREVERSE, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } setOperationAction(ISD::SETCC, MVT::v8f64, Custom); setOperationAction(ISD::SETCC, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasBWI ? 
Legal : Custom); setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); } setOperationAction(ISD::FSHL, MVT::v64i8, Custom); setOperationAction(ISD::FSHR, MVT::v64i8, Custom); setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); setOperationAction(ISD::FSHL, MVT::v16i32, Custom); setOperationAction(ISD::FSHR, MVT::v16i32, Custom); if (Subtarget.hasDQI() || Subtarget.hasFP16()) for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) setOperationAction(Opc, MVT::v8i64, Custom); if (Subtarget.hasDQI()) setOperationAction(ISD::MUL, MVT::v8i64, Legal); if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. 
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); } setF16Action(MVT::v32f16, Expand); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); setOperationAction(ISD::SETCC, MVT::v32f16, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } if (HasBWI) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } } else { setOperationAction(ISD::STORE, MVT::v32i16, Custom); setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::FSHL, VT, Legal); setOperationAction(ISD::FSHR, VT, Legal); } setOperationAction(ISD::ROTL, MVT::v32i16, Custom); setOperationAction(ISD::ROTR, MVT::v32i16, Custom); } // Only 
PCLMUL required as we always unroll clmul vectors. if (Subtarget.hasPCLMUL()) { for (auto VT : {MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::CLMUL, VT, Custom); setOperationAction(ISD::CLMULH, VT, Custom); } } setOperationAction(ISD::FNEG, MVT::v32f16, Custom); setOperationAction(ISD::FABS, MVT::v32f16, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom); if (Subtarget.hasGFNI()) { setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); setOperationAction(ISD::CTTZ, MVT::v64i8, Custom); } }// useAVX512Regs if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) { for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::FSHL, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::FSHR, VT, Subtarget.hasVLX() ? Legal : Custom); } } // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32, MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16, MVT::v16f32, MVT::v8f64}) setOperationAction(ISD::FLDEXP, VT, Custom); // These operations are handled on non-VLX by artificially widening in // isel patterns. setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { setOperationAction(Opc, MVT::v2i64, Custom); setOperationAction(Opc, MVT::v4i64, Custom); } setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); } if (Subtarget.hasCDI()) { for (auto VT : {MVT::i256, MVT::i512}) { if (VT == MVT::i512 && !Subtarget.useAVX512Regs()) continue; setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // We can 
try to convert vectors to different sizes to leverage legal // `vpcompress` cases. So we mark these supported vector sizes as Custom and // then specialize to Legal below. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64, MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16, MVT::v16i16, MVT::v8i8}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); // Legal vpcompress depends on various AVX512 extensions. // Legal in AVX512F for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Legal); // Legal in AVX512F + AVX512VL if (Subtarget.hasVLX()) for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64, MVT::v4f64, MVT::v2i64, MVT::v2f64}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Legal); // Legal in AVX512F + AVX512VBMI2 if (Subtarget.hasVBMI2()) for (MVT VT : {MVT::v32i16, MVT::v64i8}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Legal); // Legal in AVX512F + AVX512VL + AVX512VBMI2 if (Subtarget.hasVBMI2() && Subtarget.hasVLX()) for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16}) setOperationAction(ISD::VECTOR_COMPRESS, VT, Legal); } // This block control legalization of v32i1/v64i1 which are available with // AVX512BW.. 
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16, MVT::v16f16, MVT::v8f16}) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? 
if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { auto setGroup = [&] (MVT VT) { setOperationAction(ISD::FADD, VT, Legal); setOperationAction(ISD::STRICT_FADD, VT, Legal); setOperationAction(ISD::FSUB, VT, Legal); setOperationAction(ISD::STRICT_FSUB, VT, Legal); setOperationAction(ISD::FMUL, VT, Legal); setOperationAction(ISD::STRICT_FMUL, VT, Legal); setOperationAction(ISD::FDIV, VT, Legal); setOperationAction(ISD::STRICT_FDIV, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::STRICT_FSQRT, VT, Legal); setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::STRICT_FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); setOperationAction(ISD::FROUNDEVEN, VT, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); setOperationAction(ISD::FROUND, VT, Custom); setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); 
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); }; // AVX512_FP16 scalar operations setGroup(MVT::f16); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); setOperationAction(ISD::LRINT, MVT::f16, Legal); setOperationAction(ISD::LLRINT, MVT::f16, Legal); setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); 
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom); setOperationAction(ISD::LRINT, MVT::v32f16, Legal); setOperationAction(ISD::LLRINT, MVT::v8f16, Legal); } setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); if (Subtarget.hasVLX()) { setGroup(MVT::v8f16); setGroup(MVT::v16f16); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); 
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); // Need to custom widen these to prevent scalarization. 
setOperationAction(ISD::LOAD, MVT::v4f16, Custom); setOperationAction(ISD::STORE, MVT::v4f16, Custom); setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom); setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom); setOperationAction(ISD::LRINT, MVT::v8f16, Legal); setOperationAction(ISD::LRINT, MVT::v16f16, Legal); } } if (!Subtarget.useSoftFloat() && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass); // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT. // Set the operation action Custom to do the customization later. 
setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::bf16, Custom); for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { setF16Action(VT, Expand); if (!Subtarget.hasBF16()) setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32); setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32); } setOperationAction(ISD::SETCC, MVT::v8bf16, Custom); setOperationAction(ISD::SETCC, MVT::v16bf16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8bf16, Custom); addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); } if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() && Subtarget.useAVX512Regs()) { addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); setF16Action(MVT::v32bf16, Expand); for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32); setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32bf16, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32bf16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32bf16, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) { setOperationAction(ISD::FADD, MVT::v32bf16, Legal); setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); setOperationAction(ISD::FMA, MVT::v32bf16, Legal); setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); 
setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom); setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom); setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom); setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom); for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { setOperationAction(ISD::FADD, VT, Legal); setOperationAction(ISD::FSUB, VT, Legal); setOperationAction(ISD::FMUL, VT, Legal); setOperationAction(ISD::FDIV, VT, Legal); setOperationAction(ISD::FSQRT, VT, Legal); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); } for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) { setCondCodeAction(ISD::SETOEQ, VT, Custom); setCondCodeAction(ISD::SETUNE, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if (Subtarget.hasFP16()) { // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); 
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom); // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom); // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) { addRegisterClass(MVT::x86amx, &X86::TILERegClass); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. 
We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::UADDO_CARRY, VT, Custom); setOperationAction(ISD::USUBO_CARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); setOperationAction(ISD::SADDO_CARRY, VT, Custom); setOperationAction(ISD::SSUBO_CARRY, VT, Custom); } // Combine sin / cos into _sincos_stret if it is available. setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) // clang-format off for (ISD::NodeType Op : {ISD::FACOS, ISD::STRICT_FACOS, ISD::FASIN, ISD::STRICT_FASIN, ISD::FATAN, ISD::STRICT_FATAN, ISD::FATAN2, ISD::STRICT_FATAN2, ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FCOSH, ISD::STRICT_FCOSH, ISD::FEXP, ISD::STRICT_FEXP, ISD::FFLOOR, ISD::STRICT_FFLOOR, ISD::FREM, ISD::STRICT_FREM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, ISD::FSIN, ISD::STRICT_FSIN, ISD::FSINH, ISD::STRICT_FSINH, ISD::FTAN, ISD::STRICT_FTAN, ISD::FTANH, ISD::STRICT_FTANH, // TODO: Add ISD:::STRICT_FMODF too once implemented. ISD::FMODF}) if (isOperationExpandOrLibCall(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // clang-format on // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has // it, but it's just a wrapper around ldexp. if (Subtarget.isOSWindows()) { for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); } setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16); setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32); setOperationPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, ISD::SCALAR_TO_VECTOR, ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::BITCAST, ISD::VSELECT, ISD::SELECT, ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::AVGCEILS, ISD::AVGCEILU, ISD::AVGFLOORS, ISD::AVGFLOORU, ISD::BITREVERSE, ISD::ADD, ISD::SADDSAT, ISD::SSUBSAT, ISD::FADD, ISD::FSUB, ISD::FNEG, ISD::FMA, ISD::STRICT_FMA, ISD::FMINNUM, ISD::FMAXNUM, ISD::SUB, ISD::LOAD, ISD::LRINT, ISD::LLRINT, ISD::MLOAD, ISD::STORE, ISD::MSTORE, ISD::TRUNCATE, 
ISD::ZERO_EXTEND, ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG, ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::SETCC, ISD::MUL, ISD::XOR, ISD::MSCATTER, ISD::MGATHER, ISD::FP16_TO_FP, ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND, ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FSHL, ISD::FSHR, ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN, ISD::INTRINSIC_W_CHAIN}); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Default loop alignment, which can be overridden by -align-loops. setPrefLoopAlignment(Align(16)); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(Align(16)); verifyIntrinsicTables(); // Default to having -disable-strictnode-mutation on IsStrictFPEnabled = true; } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode(const Module &M) const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. 
return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; // Since v8f16 is legal, widen anything over v4f16. if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } FastISel *X86TargetLowering::createFastISel( FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const { return X86::createFastISel(funcInfo, libInfo, libcallLowering); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse, bool IgnoreAlignment) { if (!AssumeSingleUse && !Op.hasOneUse()) return false; if (!ISD::isNormalLoad(Op.getNode())) return false; // If this is an unaligned vector, make sure the target supports folding it. 
auto *Ld = cast(Op.getNode()); if (!IgnoreAlignment && !Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) return false; // TODO: If this is a non-temporal load and the target has an instruction // for it, it should not be folded. See "useNonTemporalLoad()". return true; } bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse) { assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory"); if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse)) return false; // We can not replace a wide volatile load with a broadcast-from-memory, // because that would narrow the load, which isn't legal for volatiles. auto *Ld = cast(Op.getNode()); return !Ld->isVolatile() || Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits(); } bool X86::mayFoldIntoStore(SDValue Op) { if (!Op.hasOneUse()) return false; // Peek through (oneuse) bitcast users SDNode *User = *Op->user_begin(); while (User->getOpcode() == ISD::BITCAST) { if (!User->hasOneUse()) return false; User = *User->user_begin(); } return ISD::isNormalStore(User); } bool X86::mayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->user_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } // Return true if its cheap to bitcast this to a vector type. static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG, const X86Subtarget &Subtarget, bool AssumeSingleUse = false) { if (peekThroughBitcasts(Op).getValueType().isVector()) return true; if (isa(Op) || isa(Op)) return true; EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) && DAG.getTargetLoweringInfo().getOperationAction(Opcode, VT) == TargetLowering::LegalizeAction::Custom) { // Check for larger than legal scalar integer ops that might have been // custom lowered to vector instruction. 
switch (Opcode) { case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::ADD: case ISD::SUB: return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) && mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget); } } return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse, /*IgnoreAlignment=*/true); } static bool isLogicOp(unsigned Opcode) { // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage. return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::MOVSH: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: case X86ISD::COMPRESS: case X86ISD::EXPAND: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; // 'Faux' Target Shuffles. 
case ISD::OR: case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM, bool HasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!HasSymbolicDisplacement) return true; // We can fold large offsets in the large code model because we always use // 64-bit offsets. if (CM == CodeModel::Large) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (CM == CodeModel::Kernel) return Offset >= 0; // For other non-large code models we assume that latest small object is 16MB // before end of 31 bits boundary. We may also accept pretty large negative // constants knowing that all objects are in the positive half of address // space. return Offset < 16 * 1024 * 1024; } /// Return true if the condition is an signed comparison operation. 
static bool isX86CCSigned(X86::CondCode X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return false;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return true;
  }
}

/// Map an integer ISD::CondCode onto the matching X86 condition code.
/// Any condition code not listed below is unreachable.
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  // clang-format off
  default: llvm_unreachable("Invalid integer condition!");
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETLE:  return X86::COND_LE;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  // clang-format on
  }
}

/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
///
/// \p LHS and \p RHS are in/out: for integer compares \p RHS may be replaced
/// by a zero constant, and for FP compares the two may be swapped. For FP,
/// SETOEQ and SETUNE yield X86::COND_INVALID.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
                                    bool isFP, SDValue &LHS, SDValue &RHS,
                                    SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
        // X > -1   -> compare X against 0, jump on !sign.
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
        // X < 0   -> compare X against 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
        // X >= 0   -> compare X against 0, jump on !sign.
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These four conditions are handled by swapping the operands and then using
  // the condition code of the mirrored compare (marked "flipped" below).
  switch (SetCCOpcode) {
  default:
    break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  // clang-format off
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  // clang-format on
  }
}

/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) { return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() || VT.is512BitVector(); } void X86TargetLowering::getTgtMemIntrinsic( SmallVectorImpl &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const { IntrinsicInfo Info; Info.flags = MachineMemOperand::MONone; Info.offset = 0; const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) { switch (Intrinsic) { case Intrinsic::x86_aesenc128kl: case Intrinsic::x86_aesdec128kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(1); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; Infos.push_back(Info); return; case Intrinsic::x86_aesenc256kl: case Intrinsic::x86_aesdec256kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(1); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; Infos.push_back(Info); return; case Intrinsic::x86_aesencwide128kl: case Intrinsic::x86_aesdecwide128kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; Infos.push_back(Info); return; case Intrinsic::x86_aesencwide256kl: case Intrinsic::x86_aesdecwide256kl: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; Infos.push_back(Info); return; case Intrinsic::x86_cmpccxadd32: case Intrinsic::x86_cmpccxadd64: case Intrinsic::x86_atomic_bts: 
case Intrinsic::x86_atomic_btc: case Intrinsic::x86_atomic_btr: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; Infos.push_back(Info); return; } case Intrinsic::x86_atomic_bts_rm: case Intrinsic::x86_atomic_btc_rm: case Intrinsic::x86_atomic_btr_rm: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; Infos.push_back(Info); return; } case Intrinsic::x86_aadd32: case Intrinsic::x86_aadd64: case Intrinsic::x86_aand32: case Intrinsic::x86_aand64: case Intrinsic::x86_aor32: case Intrinsic::x86_aor64: case Intrinsic::x86_axor32: case Intrinsic::x86_axor64: case Intrinsic::x86_atomic_add_cc: case Intrinsic::x86_atomic_sub_cc: case Intrinsic::x86_atomic_or_cc: case Intrinsic::x86_atomic_and_cc: case Intrinsic::x86_atomic_xor_cc: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = I.getArgOperand(0); unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); Info.align = Align(Size); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; Infos.push_back(Info); return; } } return; } switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; 
else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; Infos.push_back(Info); return; } case GATHER: case GATHER_AVX2: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; Infos.push_back(Info); return; } case SCATTER: { Info.opc = ISD::INTRINSIC_VOID; Info.ptrVal = nullptr; MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; Infos.push_back(Info); return; } default: return; } } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  // An immediate is legal only if it is bit-for-bit identical to one of the
  // entries in LegalFPImmediates. VT and ForCodeSize are not consulted here.
  for (const APFloat &FPImm : LegalFPImmediates)
    if (Imm.bitwiseIsEqual(FPImm))
      return true;
  return false;
}

/// Return true if \p Load may be narrowed to \p NewVT.
///
/// RIP-relative loads of a global address are answered purely from the
/// address' target flags (narrowing is refused for MO_GOTTPOFF). Otherwise
/// 256/512-bit vector loads with more than one use of the loaded value are
/// kept wide when every value use is an extract_subvector feeding only
/// stores, or when a full-width binop user exists and the narrowed piece
/// would start at offset 0 and is not a scalar integer.
bool X86TargetLowering::shouldReduceLoadWidth(
    SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
    std::optional ByteOffset) const {
  assert(cast(Load)->isSimple() && "illegal to narrow");

  // Walk down a chain of single-use bitcasts and return the first node that
  // is either not a bitcast or has more than one use.
  auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
    while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
      N = *N->user_begin();
    return N;
  };

  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;

  // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
  // those uses are extracted directly into a store, then the extract + store
  // can be store-folded, or (4) any use will be used by legal full width
  // instruction. Then, it's probably not worth splitting the load.
  EVT VT = Load->getValueType(0);
  if ((VT.is256BitVector() || VT.is512BitVector()) &&
      !SDValue(Load, 0).hasOneUse()) {
    bool FullWidthUse = false;
    bool AllExtractStores = true;
    for (SDUse &Use : Load->uses()) {
      // Skip uses of the chain value. Result 0 of the node is the load value.
      if (Use.getResNo() != 0)
        continue;

      const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());

      // If this use is an extract + store, it's probably not worth splitting.
      if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          all_of(User->uses(), [&](const SDUse &U) {
            const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
            return Inner->getOpcode() == ISD::STORE;
          }))
        continue;

      AllExtractStores = false;

      // If any use is a full width legal/target bin op, then assume it's
      // legal and won't split.
      if (isBinOp(User->getOpcode()) &&
          (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
           User->getOpcode() > ISD::BUILTIN_OP_END))
        FullWidthUse = true;
    }

    if (AllExtractStores)
      return false;

    // If we have a user that uses the full vector width, then this use is
    // only worth splitting if the offset isn't 0 (to avoid an
    // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
    if (FullWidthUse)
      return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
  }

  return true;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
///
/// Only the bit width of \p Ty is examined: widths of 1..64 bits are accepted.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

/// Returns false only when the compare operand type is floating point (other
/// than f128) and the subtarget is 64-bit LP64 with AVX.
bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
  // If we are using XMM registers in the ABI and the condition of the select is
  // a floating-point compare and we have blendv or conditional move, then it is
  // cheaper to select instead of doing a cross-register move and creating a
  // load that depends on the compare result.
  bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}

/// Returns true unless \p VT is a vector type on an AVX512 subtarget.
bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  if (VT.isVector() && Subtarget.hasAVX512())
    return false;

  return true;
}

/// Returns true if a multiply by the constant splat \p C should be decomposed
/// into a shift plus add/sub (optionally negated).
bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Find the type this will be legalized to. Otherwise we might prematurely
  // convert this to shl+add/sub and then still have to type legalize those ops.
  // Another choice would be to defer the decision for illegal types until
  // after type legalization. But constant splat vectors of i64 can't make it
  // through type legalization on 32-bit targets so we would need to special
  // case vXi64.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // Multiply is a complex op with higher latency and lower throughput in
  // most implementations, sub-vXi32 vector multiplies are always fast,
  // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
  // is always going to be slow.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
      (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
    return false;

  // shl+add, shl+sub, shl+add+neg
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}

/// Returns true if extracting a \p ResVT subvector at element \p Index from a
/// \p SrcVT vector is considered cheap.
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  // NOTE(review): the size test below requires ResVT to be twice as wide as
  // SrcVT. For a half-width extract one would expect SrcVT to be twice ResVT;
  // as written the second clause looks unreachable for a real extract, leaving
  // only Index == 0 - confirm the intended operand order.
  if (ResVT.getVectorElementType() == MVT::i1)
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
                          (Index == ResVT.getVectorNumElements()));

  // Otherwise only extractions aligned to a multiple of the result element
  // count are cheap.
  return (Index % ResVT.getVectorNumElements()) == 0;
}

/// Returns true if the vector binop \p VecOp should be converted to a scalar
/// op.
bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
  unsigned Opc = VecOp.getOpcode();

  // Assume target opcodes can't be scalarized.
  // TODO - do we have any exceptions?
  if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
    return false;

  // If the vector op is not supported, try to convert to scalar.
  EVT VecVT = VecOp.getValueType();
  if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
    return true;

  // If the vector op is supported, but the scalar op is not, the transform may
  // not be worthwhile.
  EVT ScalarVT = VecVT.getScalarType();
  return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}

/// Returns true if an overflow-reporting node should be formed for \p Opcode
/// on \p VT: never for vectors; otherwise when \p VT is a simple type or the
/// operation is not marked Expand. The unnamed bool parameter is ignored.
bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
                                             bool) const {
  // TODO: Allow vectors?
  if (VT.isVector())
    return false;
  return VT.isSimple() || !isOperationExpand(Opcode, VT);
}

bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
  // i32/i64 or can rely on BSF passthrough value.
  return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
         Subtarget.hasBitScanPassThrough() ||
         (!Ty->isVectorTy() &&
          Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
}

bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
  // passthrough value.
  return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
         Subtarget.hasBitScanPassThrough();
}

bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
  // expensive than a straight movsd. On the other hand, it's important to
  // shrink long double fp constant since fldt is very slow.
  return !Subtarget.hasSSE2() || VT == MVT::f80;
}

/// Returns true for f64 with SSE2, f32 with SSE1, and unconditionally for f16.
bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
  return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
         (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
}

/// Returns true if replacing a load of \p LoadVT followed by a bitcast to
/// \p BitcastVT with a direct load of \p BitcastVT is considered beneficial.
/// Falls back to the generic TargetLowering answer when no X86-specific rule
/// applies.
bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
                                                const SelectionDAG &DAG,
                                                const MachineMemOperand &MMO) const {
  // Without AVX512, do not turn a scalar load into a load of an i1 vector.
  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
      BitcastVT.getVectorElementType() == MVT::i1)
    return false;

  // Without DQI, keep an i8 load rather than loading v8i1 directly.
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
    return false;

  if (LoadVT.isVector() && BitcastVT.isVector()) {
    // If both types are legal vectors, it's always ok to convert them.
    // Don't convert to an illegal type.
    if (isTypeLegal(LoadVT))
      return isTypeLegal(BitcastVT);
  }

  // If we have a large vector type (even if illegal), don't bitcast to large
  // (illegal) scalar types. Better to load fewer vectors and extract.
  if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
      BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
    return false;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}

/// Returns true if stores may be merged into a single store of type \p MemVT
/// in function \p MF. \p AddressSpace is not consulted.
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                         const MachineFunction &MF) const {
  // When the function carries the NoImplicitFloat attribute, do not merge
  // beyond MaxIntSize bits (64 on 64-bit subtargets, otherwise 32).
  bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);

  if (NoFloat) {
    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
    return (MemVT.getSizeInBits() <= MaxIntSize);
  }
  // Make sure we don't merge greater than our preferred vector
  // width.
  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
    return false;

  return true;
}

/// Returns the subtarget's fast-LZCNT feature bit.
bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}

/// Always returns true; \p AndI is not inspected.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

/// Returns true if a scalar and-not feeding a compare is available for \p Y:
/// requires BMI, an i32/i64 scalar type, and \p Y must not be a non-opaque
/// constant.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  EVT VT = Y.getValueType();

  if (VT.isVector())
    return false;

  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return !isa(Y) || cast(Y)->isOpaque();
}

/// Returns true if an and-not operation is available for the type of \p Y.
/// Scalars defer to hasAndNotCompare(); vectors need SSE1 and at least 128
/// bits, and every type other than v4i32 additionally needs SSE2.
bool X86TargetLowering::hasAndNot(SDValue Y) const {
  EVT VT = Y.getValueType();

  if (!VT.isVector())
    return hasAndNotCompare(Y);

  // Vector.

  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
    return false;

  if (VT == MVT::v4i32)
    return true;

  return Subtarget.hasSSE2();
}

/// Returns true when \p X is a scalar integer; \p Y is not inspected.
bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
  return X.getValueType().isScalarInteger(); // 'bt'
}

/// X86 refinement of the generic hook: first honours the base-class answer,
/// then accepts the fold for scalars, splat shift amounts, AVX2 subtargets,
/// or when the new shift opcode is SHL.
bool X86TargetLowering::
    shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
        SelectionDAG &DAG) const {
  // Does baseline recommend not to perform the fold by default?
  if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
          X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
    return false;
  // For scalars this transform is always beneficial.
  if (X.getValueType().isScalarInteger())
    return true;
  // If all the shift amounts are identical, then transform is beneficial even
  // with rudimentary SSE2 shifts.
  if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
    return true;
  // If we have AVX2 with its powerful shift operations, then it's also good.
  if (Subtarget.hasAVX2())
    return true;
  // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
  return NewShiftOpcode == ISD::SHL;
}

/// Choose the preferred opcode (the incoming \p ShiftOpc, ISD::SHL, ISD::SRL
/// or ISD::ROTL) for the shift/rotate in a compare-equal-of-pieces pattern.
/// Non-integer types always keep \p ShiftOpc. \p AndMask must hold a value
/// when \p ShiftOpc is SHL or SRL.
unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand(
    EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
    const APInt &ShiftOrRotateAmt, const std::optional &AndMask) const {
  if (!VT.isInteger())
    return ShiftOpc;

  bool PreferRotate = false;
  if (VT.isVector()) {
    // For vectors, if we have rotate instruction support, then it's definitely
    // best. Otherwise it's not clear what's best, so just don't make changes.
    PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
                                             VT.getScalarType() == MVT::i64);
  } else {
    // For scalar, if we have bmi prefer rotate for rorx. Otherwise prefer
    // rotate unless we have a zext mask+shr.
    PreferRotate = Subtarget.hasBMI2();
    if (!PreferRotate) {
      unsigned MaskBits =
          VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
      PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
    }
  }

  if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
    assert(AndMask.has_value() && "Null andmask when querying about shift+and");

    if (PreferRotate && MayTransformRotate)
      return ISD::ROTL;

    // If vector we don't really get much benefit swapping around constants.
    // Maybe we could check if the DAG has the flipped node already in the
    // future.
    if (VT.isVector())
      return ShiftOpc;

    // See if it is beneficial to swap the shift type.
    if (ShiftOpc == ISD::SHL) {
      // If the current setup has imm64 mask, then inverse will have
      // at least imm32 mask (or be zext i32 -> i64).
      if (VT == MVT::i64)
        return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
                                                  : ShiftOpc;

      // We can only benefit if req at least 7-bit for the mask. We
      // don't want to replace shl of 1,2,3 as they can be implemented
      // with lea/add.
      return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
    }

    if (VT == MVT::i64)
      // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
      // extremely efficient.
      return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;

    // Keep small shifts as shl so we can generate add/lea.
    return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
  }

  // We prefer rotate for vectors or if we won't get a zext mask with SRL
  // (PreferRotate will be set in the latter case).
  if (PreferRotate || !MayTransformRotate || VT.isVector())
    return ShiftOpc;

  // Non-vector type and we have a zext mask with SRL.
  return ISD::SRL;
}

/// Compute the {base cost, likely bias, unlikely bias} triple used when
/// deciding whether two branch conditions joined by \p Opc are merged.
/// Starts from the x86-br-merging-base-cost option and adjusts it for CCMP
/// and for EQ/EQ compare pairs; returns {-1, -1, -1} for an OR of two EQ
/// compares when CCMP is unavailable.
TargetLoweringBase::CondMergingParams
X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
                                                 const Value *Lhs,
                                                 const Value *Rhs) const {
  using namespace llvm::PatternMatch;
  int BaseCost = BrMergingBaseCostThresh.getValue();
  // With CCMP, branches can be merged in a more efficient way.
  if (BaseCost >= 0 && Subtarget.hasCCMP())
    BaseCost += BrMergingCcmpBias;
  // a == b && a == c is a fast pattern on x86.
  if (BaseCost >= 0 && Opc == Instruction::And &&
      match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
      match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
    BaseCost += 1;

  // For OR conditions with EQ comparisons, prefer splitting into branches
  // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
  // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
  // comparisons (SLT, SGT) that can be optimized.
  if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
      match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
      match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
    return {-1, -1, -1};

  return {BaseCost, BrMergingLikelyBias.getValue(),
          BrMergingUnlikelyBias.getValue()};
}

/// Returns true for every node except ISD::FP_EXTEND.
bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
  return N->getOpcode() != ISD::FP_EXTEND;
}

/// Returns true if the shl(srl) / srl(shl) pair rooted at \p N should be
/// folded into a mask.
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
    const SDNode *N) const {
  assert(((N->getOpcode() == ISD::SHL &&
           N->getOperand(0).getOpcode() == ISD::SRL) ||
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // TODO: Should we always create i64 masks? Or only folded immediates?
  EVT VT = N->getValueType(0);
  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
    // Only fold if the shift values are equal - so it folds to AND.
    // TODO - we should fold if either is a non-uniform vector but we don't do
    // the fold for non-splats yet.
    return N->getOperand(1) == N->getOperand(0).getOperand(1);
  }
  return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N);
}

/// Returns true for scalar types no wider than 64 bits on 64-bit subtargets
/// (32 bits otherwise); always false for vectors.
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
  EVT VT = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  if (VT.isVector())
    return false;

  unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
  return VT.getScalarSizeInBits() <= MaxWidth;
}

/// Picks LowerToLibcall for functions with the minsize attribute on
/// non-Windows targets; otherwise defers to the generic implementation.
TargetLowering::ShiftLegalizationStrategy
X86TargetLowering::preferredShiftLegalizationStrategy(
    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      !Subtarget.isOSWindows())
    return ShiftLegalizationStrategy::LowerToLibcall;
  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
                                                            ExpansionFactor);
}

bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  return isTypeLegal(VT);
}

/// Returns the type to use for an equality compare of \p NumBits bits: the
/// integer type of that width when legal, v16i8 / v32i8 for 128 / 256 bits
/// when those vector types are legal, and INVALID_SIMPLE_VALUE_TYPE otherwise.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Return true if every element in Mask is the undef sentinel value or equal to
/// the specified value.
static bool isUndefOrEqual(ArrayRef Mask, int CmpVal) {
  return llvm::all_of(Mask, [CmpVal](int M) {
    return (M == SM_SentinelUndef) || (M == CmpVal);
  });
}

/// Return true if every element in Mask, beginning from position Pos and ending
/// in Pos+Size is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqualInRange(ArrayRef Mask, int CmpVal, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); }); } /// Val is either the undef or zero sentinel value. static bool isUndefOrZero(int Val) { return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); } /// Return true if every element in Mask, beginning from position Pos and ending /// in Pos+Size is the undef sentinel value. static bool isUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size) { return llvm::all_of(Mask.slice(Pos, Size), equal_to(SM_SentinelUndef)); } /// Return true if the mask creates a vector whose lower half is undefined. static bool isUndefLowerHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, 0, NumElts / 2); } /// Return true if the mask creates a vector whose upper half is undefined. static bool isUndefUpperHalf(ArrayRef Mask) { unsigned NumElts = Mask.size(); return isUndefInRange(Mask, NumElts / 2, NumElts / 2); } /// Return true if Val falls within the specified range (L, H]. static bool isInRange(int Val, int Low, int Hi) { return (Val >= Low && Val < Hi); } /// Return true if the value of any element in Mask falls within the specified /// range (L, H]. static bool isAnyInRange(ArrayRef Mask, int Low, int Hi) { return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } /// Return true if the value of any element in Mask is the zero sentinel value. static bool isAnyZero(ArrayRef Mask) { return llvm::any_of(Mask, equal_to(SM_SentinelZero)); } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef or if its value /// falls within the specified range (L, H]. 
static bool isUndefOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); } /// Return true if Val is undef, zero or if its value falls within the /// specified range (L, H]. static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { return isUndefOrZero(Val) || isInRange(Val, Low, Hi); } /// Return true if every element in Mask is undef, zero or if its value /// falls within the specified range (L, H]. static bool isUndefOrZeroOrInRange(ArrayRef Mask, int Low, int Hi) { return llvm::all_of( Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); } /// Return true if every element in Mask, is an in-place blend/select mask or is /// undef. [[maybe_unused]] static bool isBlendOrUndef(ArrayRef Mask) { unsigned NumElts = Mask.size(); for (auto [I, M] : enumerate(Mask)) if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos + Size, falls within the specified /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. static bool isSequentialOrUndefInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size], or is undef or is zero. static bool isSequentialOrUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size, int Low, int Step = 1) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) return false; return true; } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef or is zero. 
static bool isUndefOrZeroInRange(ArrayRef Mask, unsigned Pos, unsigned Size) {
  // Tests only the Size elements starting at Pos.
  return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
}

/// Return true if every element of a single input is referenced by the shuffle
/// mask. i.e. it just permutes them all.
static bool isCompletePermute(ArrayRef Mask) {
  unsigned NumElts = Mask.size();
  // One bit per source element; set when some mask entry references it.
  APInt DemandedElts = APInt::getZero(NumElts);
  for (int M : Mask)
    if (isInRange(M, 0, NumElts))
      DemandedElts.setBit(M);
  return DemandedElts.isAllOnes();
}

/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Writes the mask for wider elements into WidenedMask if valid (any previous
/// contents are overwritten). Otherwise leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef Mask,
                                    SmallVectorImpl &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  // Elements are consumed in adjacent pairs (i, i + 1).
  // NOTE(review): assumes Mask.size() is even - TODO confirm with callers.
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}

/// Overload that first rewrites \p Mask using \p Zeroable: when \p V2IsZero is
/// set, every non-undef element whose Zeroable bit is set becomes the zero
/// sentinel. The rewritten mask is then handed to the two-argument overload.
static bool canWidenShuffleElements(ArrayRef Mask,
                                    const APInt &Zeroable,
                                    bool V2IsZero,
                                    SmallVectorImpl &WidenedMask) {
  // Create an alternative mask with info about zeroable elements.
  // Here we do not set undef elements as zeroable.
  SmallVector ZeroableMask(Mask);
  if (V2IsZero) {
    assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
    for (int i = 0, Size = Mask.size(); i != Size; ++i)
      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
        ZeroableMask[i] = SM_SentinelZero;
  }
  return canWidenShuffleElements(ZeroableMask, WidenedMask);
}

/// Query-only overload: reports whether \p Mask can be widened and discards
/// the widened mask.
static bool canWidenShuffleElements(ArrayRef Mask) {
  SmallVector WidenedMask;
  return canWidenShuffleElements(Mask, WidenedMask);
}

// Attempt to narrow/widen shuffle mask until it matches the target number of
// elements. On success ScaledMask holds a mask with NumDstElts elements; on
// failure its contents are whatever the last widening attempt left behind.
static bool scaleShuffleElements(ArrayRef Mask, unsigned NumDstElts,
                                 SmallVectorImpl &ScaledMask) {
  unsigned NumSrcElts = Mask.size();
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Illegal shuffle scale factor");

  // Narrowing is guaranteed to work.
  if (NumDstElts >= NumSrcElts) {
    int Scale = NumDstElts / NumSrcElts;
    llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
    return true;
  }

  // We have to repeat the widening until we reach the target size, but we can
  // split out the first widening as it sets up ScaledMask for us.
  if (canWidenShuffleElements(Mask, ScaledMask)) {
    while (ScaledMask.size() > NumDstElts) {
      SmallVector WidenedMask;
      if (!canWidenShuffleElements(ScaledMask, WidenedMask))
        return false;
      ScaledMask = std::move(WidenedMask);
    }
    return true;
  }

  return false;
}

// Query-only wrapper around scaleShuffleElements: reports whether Mask can be
// scaled to NumDstElts elements and discards the scaled mask.
static bool canScaleShuffleElements(ArrayRef Mask, unsigned NumDstElts) {
  SmallVector ScaledMask;
  return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
}

// Helper to grow the shuffle mask for a larger value type.
// NOTE: This is different to scaleShuffleElements which is a same size type.
// DstMask must be empty on entry. Each non-negative index is rebased so that
// input k starts at k * Scale * NumSrcElts, and (Scale - 1) * NumSrcElts undef
// elements are appended.
static void growShuffleMask(ArrayRef SrcMask,
                            SmallVectorImpl &DstMask,
                            unsigned SrcSizeInBits, unsigned DstSizeInBits) {
  assert(DstMask.empty() && "Expected an empty shuffle mas");
  assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
  unsigned Scale = DstSizeInBits / SrcSizeInBits;
  unsigned NumSrcElts = SrcMask.size();
  DstMask.assign(SrcMask.begin(), SrcMask.end());
  for (int &M : DstMask) {
    // Negative (sentinel) entries are left untouched.
    if (M < 0)
      continue;
    M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
  }
  DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
}

/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}

// Build a vector of constants.
// Use an UNDEF node for any negative value when IsMask is set.
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  // "64-bit mode" here means i64 is a legal type for the target lowering.
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    // When splitting, the second i32 of each pair is a constant 0 (or undef).
    // NOTE(review): a negative value with IsMask == false is therefore not
    // sign-extended into the second half - confirm this is intended.
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}

// Build a vector of type VT from per-element bit patterns. Element i is UNDEF
// when bit i of Undefs is set; otherwise it is the constant Bits[i]. 64-bit
// elements are emitted as two i32 halves (bits 0-31 first, then bits 32-63)
// when i64 is not a legal type.
static SDValue getConstVector(ArrayRef Bits, const APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  MVT EltIntVT = EltVT.changeTypeToInteger();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
    } else {
      // Create the constant as an integer, then bitcast to the element type.
      Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

// Convenience overload of the above with no undef elements.
static SDValue getConstVector(ArrayRef Bits, MVT VT,
                              SelectionDAG &DAG, const SDLoc &dl) {
  APInt Undefs = APInt::getZero(Bits.size());
  return getConstVector(Bits, Undefs, VT, DAG, dl);
}

/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  const MVT EltVT = VT.getVectorElementType();
  assert((VT.is128BitVector() || VT.is256BitVector() ||
          VT.is512BitVector() || EltVT == MVT::i1) &&
         "Unexpected vector type");

  // Every path materialises the zero in some canonical type and then bitcasts
  // it to the requested type, so that SSE/AVX zero vectors get CSE'd.
  auto CastToResult = [&](SDValue Zero) { return DAG.getBitcast(VT, Zero); };

  // No integer 128-bit type is available without SSE2: use +0.0 as v4f32.
  if (VT.is128BitVector() && !Subtarget.hasSSE2())
    return CastToResult(DAG.getConstantFP(+0.0, dl, MVT::v4f32));

  // Floating-point vectors with a legal element type are built directly.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (VT.isFloatingPoint() && TLI.isTypeLegal(EltVT))
    return CastToResult(DAG.getConstantFP(+0.0, dl, VT));

  // Mask (i1) vectors are built directly in their own type.
  if (EltVT == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    return CastToResult(DAG.getConstant(0, dl, VT));
  }

  // Everything else is built as a vector of i32 zeros of the same total size.
  const unsigned Num32BitElts = VT.getSizeInBits() / 32;
  const MVT ZeroVT = MVT::getVectorVT(MVT::i32, Num32BitElts);
  return CastToResult(DAG.getConstant(0, dl, ZeroVT));
}

// Helper to determine if the ops are all the extracted subvectors come from a
// single source. If we allow commute they don't have to be in order (Lo/Hi).
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
  // Both operands must be same-typed EXTRACT_SUBVECTORs of one source vector.
  if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      LHS.getValueType() != RHS.getValueType() ||
      LHS.getOperand(0) != RHS.getOperand(0))
    return SDValue();

  // The source must be exactly twice as wide as each extracted piece.
  SDValue Src = LHS.getOperand(0);
  if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
    return SDValue();

  // Accept LHS = low half / RHS = high half, or the swapped order when
  // AllowCommute is set. Returns an empty SDValue when nothing matches.
  unsigned NumElts = LHS.getValueType().getVectorNumElements();
  if ((LHS.getConstantOperandAPInt(1) == 0 &&
       RHS.getConstantOperandAPInt(1) == NumElts) ||
      (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
       LHS.getConstantOperandAPInt(1) == NumElts))
    return Src;

  return SDValue();
}

/// Extract the vectorWidth-bit chunk of \p Vec that contains element
/// \p IdxVal. \p IdxVal is rounded down to the start of its chunk.
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned ResultNumElts =
      (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);

  assert(ResultVT.getSizeInBits() == vectorWidth &&
         "Illegal subvector extraction");

  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  // Check if we're extracting the upper undef of a widening pattern.
  if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
      Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
      isNullConstant(Vec.getOperand(2)))
    return DAG.getUNDEF(ResultVT);

  return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) &&
         "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}

/// Insert the vectorWidth-bit vector \p Vec into \p Result at the chunk that
/// contains element \p IdxVal (rounded down to the chunk start). Only widths
/// of 128 and 256 bits are supported.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF leaves Result unchanged.
  if (Vec.isUndef())
    return Result;

  // Insert the relevant vectorWidth bits.
  EVT VT = Vec.getValueType();
  unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);
  return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
///
/// \p Vec is inserted at element 0 of a \p VT vector that is all zeros when
/// \p ZeroNewElements is set and undef otherwise.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              const SDLoc &dl) {
  EVT VecVT = Vec.getValueType();
  assert(VecVT.getFixedSizeInBits() <= VT.getFixedSizeInBits() &&
         VecVT.getScalarType() == VT.getScalarType() &&
         "Unsupported vector widening type");
  // If the upper 128-bits of a build vector are already undef/zero, then try to
  // widen from the lower 128-bits.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
    unsigned NumSrcElts = VecVT.getVectorNumElements();
    ArrayRef Hi = Vec->ops().drop_front(NumSrcElts / 2);
    // Zero operands in the upper half only qualify when the new elements are
    // going to be zero as well.
    if (all_of(Hi, [&](SDValue V) {
          return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
        }))
      Vec = extract128BitVector(Vec, 0, DAG, dl);
  }
  SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
                                : DAG.getUNDEF(VT);
  return DAG.getInsertSubvector(dl, Res, Vec, 0);
}

/// Widen a vector to a larger size with the same scalar type, with the new
/// elements either zero or undef.
static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl, unsigned WideSizeInBits) { assert(Vec.getValueSizeInBits() <= WideSizeInBits && (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && "Unsupported vector widening type"); unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); MVT SVT = Vec.getSimpleValueType().getScalarType(); MVT VT = MVT::getVectorVT(SVT, WideNumElts); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } /// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT /// and bitcast with integer types. static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) { assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector"); unsigned NumElts = VT.getVectorNumElements(); if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; return VT; } /// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and /// bitcast with integer types. static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget); return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } // Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. 
static bool collectConcatOps(SDNode *N, SmallVectorImpl &Ops, SelectionDAG &DAG) { assert(Ops.empty() && "Expected an empty ops vector"); if (N->getOpcode() == ISD::CONCAT_VECTORS) { Ops.append(N->op_begin(), N->op_end()); return true; } if (N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); EVT VT = Src.getValueType(); EVT SubVT = Sub.getValueType(); if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) { // insert_subvector(undef, x, lo) if (Idx == 0 && Src.isUndef()) { Ops.push_back(Sub); Ops.push_back(DAG.getUNDEF(SubVT)); return true; } if (Idx == (VT.getVectorNumElements() / 2)) { // insert_subvector(insert_subvector(undef, x, lo), y, hi) if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && Src.getOperand(1).getValueType() == SubVT && isNullConstant(Src.getOperand(2))) { // Attempt to recurse into inner (matching) concats. SDValue Lo = Src.getOperand(1); SDValue Hi = Sub; SmallVector LoOps, HiOps; if (collectConcatOps(Lo.getNode(), LoOps, DAG) && collectConcatOps(Hi.getNode(), HiOps, DAG) && LoOps.size() == HiOps.size()) { Ops.append(LoOps); Ops.append(HiOps); return true; } Ops.push_back(Lo); Ops.push_back(Hi); return true; } // insert_subvector(x, extract_subvector(x, lo), hi) if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { Ops.append(2, Sub); return true; } // insert_subvector(undef, x, hi) if (Src.isUndef()) { Ops.push_back(DAG.getUNDEF(SubVT)); Ops.push_back(Sub); return true; } } } } if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); uint64_t Idx = N->getConstantOperandVal(1); // Collect all the subvectors from the source vector and slice off the // extraction. 
SmallVector SrcOps; if (collectConcatOps(Src.getNode(), SrcOps, DAG) && VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() && (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 && (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) { unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements(); unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits(); Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs); return true; } } assert(Ops.empty() && "Expected an empty ops vector"); return false; } // Helper to check if \p V can be split into subvectors and the upper subvectors // are all undef. In which case return the lower subvector. static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG) { SmallVector SubOps; if (!collectConcatOps(V.getNode(), SubOps, DAG)) return SDValue(); unsigned NumSubOps = SubOps.size(); unsigned HalfNumSubOps = NumSubOps / 2; assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors"); ArrayRef UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end()); if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); })) return SDValue(); EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); ArrayRef LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps); return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps); } // Helper to check if we can access all the constituent subvectors without any // extract ops. 
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) { SmallVector Ops; return collectConcatOps(V.getNode(), Ops, DAG); } static std::pair splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) { EVT VT = Op.getValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && "Can't split odd sized vector"); SmallVector SubOps; if (collectConcatOps(Op.getNode(), SubOps, DAG)) { assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat"); unsigned HalfOps = SubOps.size() / 2; EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); SmallVector LoOps(SubOps.begin(), SubOps.begin() + HalfOps); SmallVector HiOps(SubOps.begin() + HalfOps, SubOps.end()); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps); return std::make_pair(Lo, Hi); } // If this is a splat value (with no-undefs) then use the lower subvector, // which should be a free extraction. SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); if (DAG.isSplatValue(Op, /*AllowUndefs*/ false)) return std::make_pair(Lo, Lo); SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2); return std::make_pair(Lo, Hi); } /// Break an operation into 2 half sized ops and then concatenate the results. 
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
  // Vector operands are split in half; non-vector operands are passed
  // unchanged to both half-sized ops.
  unsigned NumOps = Op.getNumOperands();
  EVT VT = Op.getValueType();

  // Extract the LHS Lo/Hi vectors
  SmallVector<SDValue> LoOps(NumOps, SDValue());
  SmallVector<SDValue> HiOps(NumOps, SDValue());
  for (unsigned I = 0; I != NumOps; ++I) {
    SDValue SrcOp = Op.getOperand(I);
    if (!SrcOp.getValueType().isVector()) {
      LoOps[I] = HiOps[I] = SrcOp;
      continue;
    }
    std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
  }

  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
                     DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
}

/// Break an unary integer operation into 2 half sized ops and then
/// concatenate the result back.
///
/// Both the operand and the result must be 256-bit or 512-bit vectors with
/// the same number of elements (asserted).
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  // Make sure we only try to split 256/512-bit types to avoid creating
  // narrow vectors.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((Op.getOperand(0).getValueType().is256BitVector() ||
          Op.getOperand(0).getValueType().is512BitVector()) &&
         (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
             VT.getVectorNumElements() &&
         "Unexpected VTs!");
  return splitVectorOp(Op, DAG, dl);
}

/// Break a binary integer operation into 2 half sized ops and then
/// concatenate the result back.
///
/// Both operands must have the result type, which must be a 256-bit or
/// 512-bit vector (asserted).
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
                                    const SDLoc &dl) {
  // Assert that all the types match.
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert(Op.getOperand(0).getValueType() == VT &&
         Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
  assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
  return splitVectorOp(Op, DAG, dl);
}

// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. // The argument Builder is a function that will be applied on each split part: // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef) template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, F Builder, bool CheckBWI = true, bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); } } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) { NumSubs = VT.getSizeInBits() / 256; assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); } } else { if (VT.getSizeInBits() > 128) { NumSubs = VT.getSizeInBits() / 128; assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); } } if (NumSubs == 1) return Builder(DAG, DL, Ops); SmallVector Subs; for (unsigned i = 0; i != NumSubs; ++i) { SmallVector SubOps; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); } Subs.push_back(Builder(DAG, DL, SubOps)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } // Helper function that extends a non-512-bit vector op to 512-bits on non-VLX // targets. 
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  // Builds Opcode with result type VT. When the subtarget has no VLX and VT
  // is not already 512 bits, vector operands are widened to 512 bits, the op
  // is created at 512 bits and the low VT-sized subvector is returned.
  assert(Subtarget.hasAVX512() && "AVX512 target expected");
  MVT SVT = VT.getScalarType();

  // If we have a 32/64 splatted constant, splat it to DstTy to
  // encourage a foldable broadcast'd operand.
  auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
    unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
    // AVX512 broadcasts 32/64-bit operands.
    // TODO: Support float once getAVX512Node is used by fp-ops.
    if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
        !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
      return SDValue();
    // If we're not widening, don't bother if we're not bitcasting.
    if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
      return SDValue();
    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
      APInt SplatValue, SplatUndef;
      unsigned SplatBitSize;
      bool HasAnyUndefs;
      if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                              HasAnyUndefs, OpEltSizeInBits) &&
          !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
        return DAG.getConstant(SplatValue, DL, DstVT);
    }
    return SDValue();
  };

  bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());

  MVT DstVT = VT;
  if (Widen)
    DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());

  // Canonicalize src operands.
  SmallVector<SDValue> SrcOps(Ops);
  for (SDValue &Op : SrcOps) {
    MVT OpVT = Op.getSimpleValueType();
    // Just pass through scalar operands.
    if (!OpVT.isVector())
      continue;
    assert(OpVT == VT && "Vector type mismatch");

    if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
      Op = BroadcastOp;
      continue;
    }

    // Just widen the subvector by inserting into an undef wide vector.
    if (Widen)
      Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
  }

  SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);

  // Perform the 512-bit op then extract the bottom subvector.
  if (Widen)
    Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
  return Res;
}

/// Insert i1-subvector to i1-vector.
///
/// \p Op is the node being lowered: operand 0 is the destination vector,
/// operand 1 the subvector and operand 2 the constant insertion index. The
/// work is done in a mask type widened by widenMaskVectorType, using
/// KSHIFTL/KSHIFTR and AND/OR, and the result is extracted back to the
/// original type.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  unsigned IdxVal = Op.getConstantOperandVal(2);

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);

  // Extend to natively supported kshift.
  MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     DAG.getConstant(0, dl, WideOpVT), SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
    Vec =
        DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  SubVec =
      DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    // If upper elements of Vec are known undef, then just shift into place.
    if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
                     [](SDValue V) { return V.isUndef(); })) {
      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    } else {
      // Shift left then right so the bits above the subvector are cleared
      // before it lands at IdxVal.
      NumElems = WideOpVT.getVectorNumElements();
      unsigned ShiftLeft = NumElems - SubVecNumElems;
      unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
      SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                           DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
      if (ShiftRight != 0)
        SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                             DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    }
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec =
          DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);

  unsigned ShiftLeft = NumElems - SubVecNumElems;
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;

  // Do an optimization for the most frequently used types.
  if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
    // Clear the destination bits with an AND against an inverted bit mask.
    APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
    Mask0.flipAllBits();
    SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
    SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
    Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
    SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                         DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);

    // Reduce to original width if needed.
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Clear the upper bits of the subvector and move it to its insert position.
  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));

  // Isolate the bits below the insertion point.
  unsigned LowShift = NumElems - IdxVal;
  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
                    DAG.getTargetConstant(LowShift, dl, MVT::i8));

  // Isolate the bits after the last inserted bit.
  unsigned HighShift = IdxVal + SubVecNumElems;
  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
                     DAG.getTargetConstant(HighShift, dl, MVT::i8));

  // Now OR all 3 pieces together.
  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);

  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}

/// Concatenate two subvectors of the same type into a vector with twice the
/// number of elements: \p V1 becomes the low half and \p V2 the high half.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
                                const SDLoc &dl) {
  assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
  EVT SubVT = V1.getValueType();
  EVT SubSVT = SubVT.getScalarType();
  unsigned SubNumElts = SubVT.getVectorNumElements();
  unsigned SubVectorWidth = SubVT.getSizeInBits();
  EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
  SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
  return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); // Canonicalize Opcode to general extension version. switch (Opcode) { case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: Opcode = ISD::ANY_EXTEND; break; case ISD::SIGN_EXTEND: case ISD::SIGN_EXTEND_VECTOR_INREG: Opcode = ISD::SIGN_EXTEND; break; case ISD::ZERO_EXTEND: case ISD::ZERO_EXTEND_VECTOR_INREG: Opcode = ISD::ZERO_EXTEND; break; default: llvm_unreachable("Unknown extension opcode"); } // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. 
if (InVT.getSizeInBits() > 128) { assert(VT.getSizeInBits() == InVT.getSizeInBits() && "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } if (VT.getVectorNumElements() != InVT.getVectorNumElements()) Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode); return DAG.getNode(Opcode, DL, VT, In); } // Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG) { LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask); RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS); return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); } void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl &Mask, bool Lo, bool Unary) { assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && "Illegal vector type to unpack"); assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); for (int i = 0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int Pos = (i % NumEltsInLane) / 2 + LaneStart; Pos += (Unary ? 0 : NumElts * (i % 2)); Pos += (Lo ? 0 : NumEltsInLane / 2); Mask.push_back(Pos); } } /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation /// imposed by AVX and specific to the unary pattern. Example: /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); int NumElts = VT.getVectorNumElements(); for (int i = 0; i < NumElts; ++i) { int Pos = i / 2; Pos += (Lo ? 0 : NumElts / 2); Mask.push_back(Pos); } } // Attempt to constant fold, else just create a VECTOR_SHUFFLE. 
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
                                SDValue V1, SDValue V2, ArrayRef<int> Mask) {
  // When both inputs are constant build vectors (or undef) the shuffle is
  // resolved here into a new build vector; negative mask entries and
  // elements taken from an undef input stay undef.
  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
    SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
    for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
      int M = Mask[I];
      if (M < 0)
        continue;
      // Indices below NumElts select from V1, the rest from V2.
      SDValue V = (M < NumElts) ? V1 : V2;
      if (V.isUndef())
        continue;
      Ops[I] = V.getOperand(M % NumElts);
    }
    return DAG.getBuildVector(VT, dl, Ops);
  }

  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
}

/// Returns a node that packs the LHS + RHS nodes together at half width.
/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
/// TODO: Add subvector splitting if/when we have a need for it.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf = false) { MVT OpVT = LHS.getSimpleValueType(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8; assert(OpVT == RHS.getSimpleValueType() && VT.getSizeInBits() == OpVT.getSizeInBits() && (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() && "Unexpected PACK operand types"); assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) && "Unexpected PACK result type"); // Rely on vector shuffles for vXi64 -> vXi32 packing. if (EltSizeInBits == 32) { SmallVector PackMask; int Offset = PackHiHalf ? 1 : 0; int NumElts = VT.getVectorNumElements(); for (int I = 0; I != NumElts; I += 4) { PackMask.push_back(I + Offset); PackMask.push_back(I + Offset + 2); PackMask.push_back(I + Offset + NumElts); PackMask.push_back(I + Offset + NumElts + 2); } return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS), DAG.getBitcast(VT, RHS), PackMask); } // See if we already have sufficient leading bits for PACKSS/PACKUS. if (!PackHiHalf) { if (UsePackUS && DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits && DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits) return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits && DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits) return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); } // Fallback to sign/zero extending the requested half and pack. 
SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8); if (UsePackUS) { if (PackHiHalf) { LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt); } else { SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT); LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask); RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask); }; return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); }; if (!PackHiHalf) { LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt); } LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt); RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt); return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) { if (Ptr.getOpcode() == X86ISD::Wrapper || Ptr.getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr.getOperand(0); return dyn_cast(Ptr); } // TODO: Add support for non-zero offsets. 
static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) { ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return nullptr; return CNode->getConstVal(); } static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { if (!Load || !ISD::isNormalLoad(Load)) return nullptr; return getTargetConstantFromBasePtr(Load->getBasePtr()); } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast(Op)); } const Constant * X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { assert(LD && "Unexpected null LoadSDNode"); return getTargetConstantFromNode(LD); } bool X86TargetLowering::isTargetCanonicalSelect(SDNode *N) const { // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X) SDValue Cond = N->getOperand(0); SDValue RHS = N->getOperand(2); EVT CondVT = Cond.getValueType(); return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorAllZeros(RHS.getNode()); } // Extract raw constant bits from constant pools. static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = false) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned NumElts = SizeInBits / EltSizeInBits; // Can't split constant. if ((SizeInBits % EltSizeInBits) != 0) return false; // Bitcast a source array of element bits to the target size. 
auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnes()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); } return true; }; // Collect constant bits and insert into mask/undef bit masks. 
auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } if (auto *CDS = dyn_cast(Cst)) { Type *Ty = CDS->getType(); Mask = APInt::getZero(Ty->getPrimitiveSizeInBits()); Type *EltTy = CDS->getElementType(); bool IsInteger = EltTy->isIntegerTy(); bool IsFP = EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy(); if (!IsInteger && !IsFP) return false; unsigned EltBits = EltTy->getPrimitiveSizeInBits(); for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) if (IsInteger) Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits); else Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(), I * EltBits); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnes(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getZero(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getZero(1); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SmallVector SrcEltBits(1, RawBits); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. 
if (auto *BV = dyn_cast(Op)) { BitVector Undefs; SmallVector SrcEltBits; unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) { APInt UndefSrcElts = APInt::getZero(SrcEltBits.size()); for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I) if (Undefs[I]) UndefSrcElts.setBit(I); return CastBitData(UndefSrcElts, SrcEltBits); } } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; if ((SizeInBits % SrcEltSizeInBits) != 0) return false; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. 
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && EltSizeInBits <= VT.getScalarSizeInBits()) { auto *MemIntr = cast(Op); if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits()) return false; SDValue Ptr = MemIntr->getBasePtr(); if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits) SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract constant bits from a subvector broadcast. if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { auto *MemIntr = cast(Op); SDValue Ptr = MemIntr->getBasePtr(); // The source constant may be larger than the subvector broadcast, // ensure we extract the correct subvector constants. 
if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 || (SizeInBits % SubVecSizeInBits) != 0) return false; unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits; unsigned NumSubVecs = SizeInBits / SubVecSizeInBits; APInt UndefSubElts(NumSubElts, 0); SmallVector SubEltBits(NumSubElts * NumSubVecs, APInt(CstEltSizeInBits, 0)); for (unsigned i = 0; i != NumSubElts; ++i) { if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i], UndefSubElts, i)) return false; for (unsigned j = 1; j != NumSubVecs; ++j) SubEltBits[i + (j * NumSubElts)] = SubEltBits[i]; } UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(), UndefSubElts); return CastBitData(UndefSubElts, SubEltBits); } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0); SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Insert constant bits from a base and sub vector sources. if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { // If bitcasts to larger elements we might lose track of undefs - don't // allow any to be safe. 
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; APInt UndefSrcElts, UndefSubElts; SmallVector EltSrcBits, EltSubBits; if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, UndefSubElts, EltSubBits, AllowWholeUndefs && AllowUndefs, AllowPartialUndefs && AllowUndefs) && getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, UndefSrcElts, EltSrcBits, AllowWholeUndefs && AllowUndefs, AllowPartialUndefs && AllowUndefs)) { unsigned BaseIdx = Op.getConstantOperandVal(2); UndefSrcElts.insertBits(UndefSubElts, BaseIdx); for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) EltSrcBits[BaseIdx + i] = EltSubBits[i]; return CastBitData(UndefSrcElts, EltSrcBits); } } // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { EVT SrcVT = Op.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits; unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits; unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits(); unsigned BaseIdx = BaseOfs / EltSizeInBits; assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 && (VT.getSizeInBits() % EltSizeInBits) == 0 && (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index"); UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); if ((BaseIdx + NumSubElts) != NumSrcElts) EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); if (BaseIdx != 0) EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); return true; } // Extract constant bits from shuffle node sources. if (auto *SVN = dyn_cast(Op)) { // TODO - support shuffle through bitcasts. 
if (EltSizeInBits != VT.getScalarSizeInBits()) return false; ArrayRef Mask = SVN->getMask(); if ((!AllowWholeUndefs || !AllowPartialUndefs) && llvm::any_of(Mask, [](int M) { return M < 0; })) return false; APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (isAnyInRange(Mask, 0, NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs)) return false; if (isAnyInRange(Mask, NumElts, 2 * NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs)) return false; UndefElts = APInt::getZero(NumElts); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; if (M < 0) { UndefElts.setBit(i); EltBits.push_back(APInt::getZero(EltSizeInBits)); } else if (M < (int)NumElts) { if (UndefElts0[M]) UndefElts.setBit(i); EltBits.push_back(EltBits0[M]); } else { if (UndefElts1[M - NumElts]) UndefElts.setBit(i); EltBits.push_back(EltBits1[M - NumElts]); } } return true; } return false; } namespace llvm { namespace X86 { bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode( Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits, /*AllowWholeUndefs*/ true, AllowPartialUndefs)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) continue; if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { SplatIndex = -1; break; } SplatIndex = i; } if (0 <= SplatIndex) { SplatVal = EltBits[SplatIndex]; return true; } } return false; } int getRoundingModeX86(unsigned RM) { switch (static_cast<::llvm::RoundingMode>(RM)) { // clang-format off case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; case ::llvm::RoundingMode::TowardZero: return 
X86::rmTowardZero; default: return X86::rmInvalid; // clang-format on } } } // namespace X86 } // namespace llvm static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask, APInt &UndefElts) { // Extract the raw target constant bits. SmallVector EltBits; if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (const APInt &Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs) { APInt UndefElts; SmallVector EltBits; if (!getTargetConstantBitsFromNode(V, EltSizeInBIts, UndefElts, EltBits, /*AllowWholeUndefs*/ AllowUndefs, /*AllowPartialUndefs*/ false)) return false; bool IsPow2OrUndef = true; for (unsigned I = 0, E = EltBits.size(); I != E; ++I) IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2(); return IsPow2OrUndef; } // Helper to attempt to return a cheaper, bit-inverted version of \p V. static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { // TODO: don't always ignore oneuse constraints. V = peekThroughBitcasts(V); EVT VT = V.getValueType(); // Match not(xor X, -1) -> X. if (V.getOpcode() == ISD::XOR && (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) || isAllOnesConstant(V.getOperand(1)))) return V.getOperand(0); // Match not(extract_subvector(not(X)) -> extract_subvector(X). if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not, V.getOperand(1)); } } // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1). 
if (V.getOpcode() == X86ISD::PCMPGT && !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) && !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) && V.getOperand(0).hasOneUse()) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(V.getOperand(0), V.getScalarValueSizeInBits(), UndefElts, EltBits) && !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) { // Don't fold min_signed_value -> (min_signed_value - 1) bool MinSigned = false; for (APInt &Elt : EltBits) { MinSigned |= Elt.isMinSignedValue(); Elt -= 1; } if (!MinSigned) { SDLoc DL(V); MVT VT = V.getSimpleValueType(); return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1), getConstVector(EltBits, UndefElts, VT, DAG, DL)); } } } // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y). SmallVector CatOps; if (collectConcatOps(V.getNode(), CatOps, DAG)) { for (SDValue &CatOp : CatOps) { SDValue NotCat = IsNOT(CatOp, DAG); if (!NotCat) return SDValue(); CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps); } // Match not(or(not(X),not(Y))) -> and(X, Y). if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) && V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) { // TODO: Handle cases with single NOT operand -> ANDNP if (SDValue Op1 = IsNOT(V.getOperand(1), DAG)) if (SDValue Op0 = IsNOT(V.getOperand(0), DAG)) return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0), DAG.getBitcast(VT, Op1)); } return SDValue(); } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// A multi-stage pack shuffle mask is created by specifying NumStages > 1. /// Note: This ignores saturation, so inputs must be checked first. 
static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary, unsigned NumStages = 1) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; unsigned Repetitions = 1u << (NumStages - 1); unsigned Increment = 1u << NumStages; assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Stage = 0; Stage != Repetitions; ++Stage) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } } // Split the demanded elts of a PACKSS/PACKUS node between its operands. static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumInnerElts = NumElts / 2; int NumEltsPerLane = NumElts / NumLanes; int NumInnerEltsPerLane = NumInnerElts / NumLanes; DemandedLHS = APInt::getZero(NumInnerElts); DemandedRHS = APInt::getZero(NumInnerElts); // Map DemandedElts to the packed operands. for (int Lane = 0; Lane != NumLanes; ++Lane) { for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { int OuterIdx = (Lane * NumEltsPerLane) + Elt; int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; if (DemandedElts[OuterIdx]) DemandedLHS.setBit(InnerIdx); if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) DemandedRHS.setBit(InnerIdx); } } } // Split the demanded elts of a HADD/HSUB node between its operands. 
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts, DemandedLHS, DemandedRHS); DemandedLHS |= DemandedLHS << 1; DemandedRHS |= DemandedRHS << 1; } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; APInt RawUndefs; uint64_t ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch (N.getOpcode()) { case X86ISD::BLENDI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeBLENDMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::SHUFP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = 
N.getOperand(0) == N.getOperand(1); break; case X86ISD::INSERTPS: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::EXTRQI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N.getOperand(1)) && isa(N.getOperand(2))) { int BitLen = N.getConstantOperandVal(1); int BitIdx = N.getConstantOperandVal(2); DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = true; } break; case X86ISD::INSERTQI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N.getOperand(2)) && isa(N.getOperand(3))) { int BitLen = N.getConstantOperandVal(2); int BitIdx = N.getConstantOperandVal(3); DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); } break; case X86ISD::UNPCKH: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::UNPCKL: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVHLPS: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVLHPS: 
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::VALIGN: assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVALIGNMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); Ops.push_back(N.getOperand(1)); Ops.push_back(N.getOperand(0)); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePALIGNRMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); Ops.push_back(N.getOperand(1)); Ops.push_back(N.getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSLLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSRLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFMask(NumElems, MaskEltSize, 
ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFHWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodePSHUFLWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: // We only decode broadcasts of same-sized vectors, peeking through to // extracted subvectors is likely to cause hasOneUse issues with // SimplifyDemandedBits etc. if (N.getOperand(0).getValueType() == VT) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; case X86ISD::VPERMILPV: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodePSHUFBMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMI: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVPERMMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::MOVSH: assert(N.getOperand(0).getValueType() == VT 
&& "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); DecodeVPERM2X128Mask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::SHUF128: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N.getConstantOperandVal(N.getNumOperands() - 1); decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); break; case X86ISD::MOVSLDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VPERMIL2: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); SDValue MaskNode = N.getOperand(2); SDValue CtrlNode = N.getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); 
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1); SDValue MaskNode = N.getOperand(2); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodeVPPERMMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N.getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N.getOperand(1)); SDValue MaskNode = N.getOperand(0); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMVMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N.getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N.getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(2)); SDValue MaskNode = N.getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::COMPRESS: { SDValue CmpVec = N.getOperand(0); SDValue PassThru = N.getOperand(1); SDValue CmpMask = N.getOperand(2); APInt UndefElts; SmallVector EltBits; if (!getTargetConstantBitsFromNode(CmpMask, 1, UndefElts, EltBits)) return false; assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && "Illegal compression mask"); for (unsigned I = 0; I != NumElems; ++I) { if (!EltBits[I].isZero()) Mask.push_back(I); } while (Mask.size() != NumElems) { Mask.push_back(NumElems + Mask.size()); } Ops.push_back(CmpVec); Ops.push_back(PassThru); return true; } case X86ISD::EXPAND: { SDValue ExpVec = N.getOperand(0); SDValue PassThru = N.getOperand(1); SDValue ExpMask = N.getOperand(2); APInt UndefElts; SmallVector EltBits; 
if (!getTargetConstantBitsFromNode(ExpMask, 1, UndefElts, EltBits)) return false; assert(UndefElts.getBitWidth() == NumElems && EltBits.size() == NumElems && "Illegal expansion mask"); unsigned ExpIndex = 0; for (unsigned I = 0; I != NumElems; ++I) { if (EltBits[I].isZero()) Mask.push_back(I + NumElems); else Mask.push_back(ExpIndex++); } Ops.push_back(ExpVec); Ops.push_back(PassThru); return true; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero && isAnyZero(Mask)) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N.getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N.getOperand(1)); } return true; } // Wrapper for getTargetShuffleMask with InUnary; static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask) { bool IsUnary; return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary); } /// Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. 
static void computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero) { int Size = Mask.size(); KnownUndef = KnownZero = APInt::getZero(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Size; assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0) { KnownUndef.setBit(i); continue; } if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef()) KnownUndef.setBit(i); if (X86::isZeroNode(Op)) KnownZero.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits); if (Val == 0) KnownZero.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. 
if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllUndef = true; bool AllZero = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllUndef &= Op.isUndef(); AllZero &= X86::isZeroNode(Op); } if (AllUndef) KnownUndef.setBit(i); if (AllZero) KnownZero.setBit(i); continue; } } } /// Decode a target shuffle mask and inputs and see if any values are /// known to be undef or zero from their inputs. /// Returns true if the target shuffle mask was decoded. /// FIXME: Merge this with computeZeroableShuffleElements? static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, APInt &KnownUndef, APInt &KnownZero) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary)) return false; int Size = Mask.size(); SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; KnownUndef = KnownZero = APInt::getZero(Size); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Size) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Size; // Extract known constant input data. APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false)}; for (int i = 0; i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) { assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!"); if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); continue; } // Determine shuffle input and normalize the mask. unsigned SrcIdx = M / Size; SDValue V = M < Size ? 
V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { KnownUndef.setBit(i); continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) KnownUndef.setBit(i); else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) KnownZero.setBit(i); continue; } // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF // base vectors. if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Vec = V.getOperand(0); int NumVecElts = Vec.getValueType().getVectorNumElements(); if (Vec.isUndef() && Size == NumVecElts) { int Idx = V.getConstantOperandVal(2); int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements(); if (M < Idx || (Idx + NumSubElts) <= M) KnownUndef.setBit(i); } continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) KnownUndef.setBit(i); else if (SrcEltBits[SrcIdx][M] == 0) KnownZero.setBit(i); } } assert(VT.getVectorNumElements() == (unsigned)Size && "Different mask size from vector size!"); return true; } // Replace target shuffle mask elements with known undef/zero sentinels. 
static void resolveTargetShuffleFromZeroables(SmallVectorImpl &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros= true) { unsigned NumElts = Mask.size(); assert(KnownUndef.getBitWidth() == NumElts && KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch"); for (unsigned i = 0; i != NumElts; ++i) { if (KnownUndef[i]) Mask[i] = SM_SentinelUndef; else if (ResolveKnownZeros && KnownZero[i]) Mask[i] = SM_SentinelZero; } } // Extract target shuffle mask sentinel elements to known undef/zero bitmasks. static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl &Mask, APInt &KnownUndef, APInt &KnownZero) { unsigned NumElts = Mask.size(); KnownUndef = KnownZero = APInt::getZero(NumElts); for (unsigned i = 0; i != NumElts; ++i) { int M = Mask[i]; if (SM_SentinelUndef == M) KnownUndef.setBit(i); if (SM_SentinelZero == M) KnownZero.setBit(i); } } // Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask. static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond, bool IsBLENDV = false) { EVT CondVT = Cond.getValueType(); unsigned EltSizeInBits = CondVT.getScalarSizeInBits(); unsigned NumElts = CondVT.getVectorNumElements(); APInt UndefElts; SmallVector EltBits; if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits, /*AllowWholeUndefs*/ true, /*AllowPartialUndefs*/ false)) return false; Mask.resize(NumElts, SM_SentinelUndef); for (int i = 0; i != (int)NumElts; ++i) { Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) || (IsBLENDV && EltBits[i].isNonNegative())) Mask[i] += NumElts; } return true; } // Forward declaration (for getFauxShuffleMask recursive check). 
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl &Inputs,
                                   SmallVectorImpl &Mask,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts);

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
//
// Mask and Ops are cleared on entry. On success they hold the decoded mask
// and the source operands it indexes. On failure they may be left partially
// populated (e.g. the OR case appends before bailing out).
// DemandedElts must have one bit per element of N's value type.
// Depth is forwarded as Depth + 1 to the recursive shuffle decode and to most
// of the DAG known-bits / sign-bits queries made here.
// TODO: Merge into getTargetShuffleInputs()
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                               SmallVectorImpl &Mask,
                               SmallVectorImpl &Ops,
                               const SelectionDAG &DAG, unsigned Depth,
                               bool ResolveKnownElts) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  // Several cases below build byte-level masks, so require whole bytes.
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
    return false;
  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
  unsigned NumSizeInBytes = NumSizeInBits / 8;
  unsigned NumBytesPerElt = NumBitsPerElt / 8;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::VECTOR_SHUFFLE: {
    // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
    ArrayRef ShuffleMask = cast(N)->getMask();
    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
      Ops.push_back(N.getOperand(0));
      Ops.push_back(N.getOperand(1));
      return true;
    }
    return false;
  }
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    // For ANDNP the constant is taken from operand 0 and a 0xFF byte zeroes
    // the result byte; for AND it is taken from operand 1 and a 0x00 byte
    // zeroes it.
    APInt UndefElts;
    SmallVector EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
                                       /*AllowWholeUndefs*/ false,
                                       /*AllowPartialUndefs*/ false))
      return false;
    // We can't assume an undef src element gives an undef dst - the other src
    // might be zero.
    assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      const APInt &ByteBits = EltBits[i];
      // Only all-zeros / all-ones bytes can be expressed as a shuffle.
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;

    SmallVector SrcMask0, SrcMask1;
    SmallVector SrcInputs0, SrcInputs1;
    APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
    APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
    if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
                                Depth + 1, true) ||
        !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
                                Depth + 1, true))
      return false;

    // Bring both source masks to the finer of the two granularities.
    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector Mask0, Mask1;
    narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (int i = 0; i != (int)MaskSize; ++i) {
      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
      // loops converting between OR and BLEND shuffles due to
      // canWidenShuffleElements merging away undef elements, meaning we
      // fail to recognise the OR as the undef element isn't known zero.
      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(i);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(i + MaskSize);
      else
        return false;
    }
    Ops.push_back(N.getOperand(0));
    Ops.push_back(N.getOperand(1));
    return true;
  }
  case ISD::CONCAT_VECTORS: {
    // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
    unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
    if (NumBitsPerElt == 64) {
      for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
        for (unsigned M = 0; M != NumSubElts; ++M)
          Mask.push_back((I * NumElts) + M);
        Ops.push_back(N.getOperand(I));
      }
      return true;
    }
    return false;
  }
  case ISD::INSERT_SUBVECTOR: {
    SDValue Src = N.getOperand(0);
    SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t InsertIdx = N.getConstantOperandVal(2);
    // Subvector isn't demanded - just return the base vector.
    if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
      Mask.resize(NumElts);
      std::iota(Mask.begin(), Mask.end(), 0);
      Ops.push_back(Src);
      return true;
    }
    // Handle CONCAT(SUB0, SUB1).
    // Limit to vXi64/splat cases to make the most of cross lane shuffles.
    if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
        Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
        Src.getOperand(0).isUndef() &&
        Src.getOperand(1).getValueType() == SubVT &&
        Src.getConstantOperandVal(2) == 0 &&
        (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
        SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
      Mask.resize(NumElts);
      // Low half indexes the first op, high half the second (offset NumElts).
      std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
      std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
      Ops.push_back(Src.getOperand(1));
      Ops.push_back(Sub);
      return true;
    }
    if (!N->isOnlyUserOf(Sub.getNode()))
      return false;

    SmallVector SubMask;
    SmallVector SubInputs;
    SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
    EVT SubSrcVT = SubSrc.getValueType();
    if (!SubSrcVT.isVector())
      return false;

    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
    if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
      uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
      SDValue SubSrcSrc = SubSrc.getOperand(0);
      unsigned NumSubSrcSrcElts =
          SubSrcSrc.getValueType().getVectorNumElements();
      // Work at the finer element granularity of the two vector types and
      // rescale the indices accordingly.
      unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
      assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
             "Subvector valuetype mismatch");
      InsertIdx *= (MaxElts / NumElts);
      ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
      NumSubElts *= (MaxElts / NumElts);
      bool SrcIsUndef = Src.isUndef();
      for (int i = 0; i != (int)MaxElts; ++i)
        Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
      // With an undef base only SubSrcSrc is pushed, so it is input 0;
      // otherwise it is input 1 and its indices are offset by MaxElts.
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
      if (!SrcIsUndef)
        Ops.push_back(Src);
      Ops.push_back(SubSrcSrc);
      return true;
    }

    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
    APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
    if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
                                Depth + 1, ResolveKnownElts))
      return false;

    // Subvector shuffle inputs must not be larger than the subvector.
    if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
          return SubVT.getFixedSizeInBits() <
                 SubInput.getValueSizeInBits().getFixedValue();
        }))
      return false;

    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||
              (NumSubElts % SubMask.size()) == 0) &&
             "Illegal submask scale");
      if ((NumSubElts % SubMask.size()) == 0) {
        // The sub-shuffle mask is coarser: narrow it to NumSubElts entries.
        int Scale = NumSubElts / SubMask.size();
        SmallVector ScaledSubMask;
        narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
        SubMask = ScaledSubMask;
      } else {
        // The sub-shuffle mask is finer: scale up the outer element counts.
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
        NumElts *= Scale;
        InsertIdx *= Scale;
      }
    }
    Ops.push_back(Src);
    Ops.append(SubInputs.begin(), SubInputs.end());
    if (ISD::isBuildVectorAllZeros(Src.getNode()))
      Mask.append(NumElts, SM_SentinelZero);
    else
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
    for (int i = 0; i != (int)NumSubElts; ++i) {
      int M = SubMask[i];
      if (0 <= M) {
        // Src occupies input slot 0, so sub-shuffle input k becomes slot
        // k + 1 in the combined mask.
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      }
      Mask[i + InsertIdx] = M;
    }
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::INSERT_VECTOR_ELT: {
    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
    // vector, for matching src/dst vector types.
    SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

    unsigned DstIdx = 0;
    if (Opcode != ISD::SCALAR_TO_VECTOR) {
      // Check we have an in-range constant insertion index.
      if (!isa(N.getOperand(2)) ||
          N.getConstantOperandAPInt(2).uge(NumElts))
        return false;
      DstIdx = N.getConstantOperandVal(2);

      // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
      if (X86::isZeroNode(Scl)) {
        Ops.push_back(N.getOperand(0));
        for (unsigned i = 0; i != NumElts; ++i)
          Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
        return true;
      }
    }

    // Peek through trunc/aext/zext/bitcast.
    // TODO: aext shouldn't require SM_SentinelZero padding.
    // TODO: handle shift of scalars.
    // MinBitsPerElt tracks the narrowest scalar width seen along the chain.
    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
    while (Scl.getOpcode() == ISD::TRUNCATE ||
           Scl.getOpcode() == ISD::ANY_EXTEND ||
           Scl.getOpcode() == ISD::ZERO_EXTEND ||
           (Scl.getOpcode() == ISD::BITCAST &&
            Scl.getScalarValueSizeInBits() ==
                Scl.getOperand(0).getScalarValueSizeInBits())) {
      Scl = Scl.getOperand(0);
      MinBitsPerElt =
          std::min(MinBitsPerElt, Scl.getScalarValueSizeInBits());
    }
    if ((MinBitsPerElt % 8) != 0)
      return false;

    // Attempt to find the source vector the scalar was extracted from.
    SDValue SrcExtract;
    if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         Scl.getOpcode() == X86ISD::PEXTRW ||
         Scl.getOpcode() == X86ISD::PEXTRB) &&
        Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
      SrcExtract = Scl;
    }
    if (!SrcExtract || !isa(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    if (!SrcVT.getScalarType().isByteSized())
      return false;
    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
    unsigned DstByte = DstIdx * NumBytesPerElt;
    MinBitsPerElt =
        std::min(MinBitsPerElt, SrcVT.getScalarSizeInBits());

    // Create 'identity' byte level shuffle mask and then add inserted bytes.
    if (Opcode == ISD::SCALAR_TO_VECTOR) {
      Ops.push_back(SrcVec);
      Mask.append(NumSizeInBytes, SM_SentinelUndef);
    } else {
      // SrcVec is input 0, the insertion base vector is input 1.
      Ops.push_back(SrcVec);
      Ops.push_back(N.getOperand(0));
      for (int i = 0; i != (int)NumSizeInBytes; ++i)
        Mask.push_back(NumSizeInBytes + i);
    }

    // Copy the surviving source bytes and zero-pad the rest of the element.
    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
      Mask[DstByte + i] = SM_SentinelZero;
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
           "Unexpected input value type");

    APInt EltsLHS, EltsRHS;
    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

    // If we know input saturation won't happen (or we don't care for particular
    // lanes), we can treat this as a truncation shuffle.
    bool Offset0 = false, Offset1 = false;
    if (Opcode == X86ISD::PACKSS) {
      if ((!(N0.isUndef() || EltsLHS.isZero()) &&
           DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
          (!(N1.isUndef() || EltsRHS.isZero()) &&
           DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
        return false;
      // We can't easily fold ASHR into a shuffle, but if it was feeding a
      // PACKSS then it was likely being used for sign-extension for a
      // truncation, so just peek through and adjust the mask accordingly.
      if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
          N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset0 = true;
        N0 = N0.getOperand(0);
      }
      if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
          N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
        Offset1 = true;
        N1 = N1.getOperand(0);
      }
    } else {
      // PACKUS: the upper half of every demanded source element must be zero.
      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
      if ((!(N0.isUndef() || EltsLHS.isZero()) &&
           !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
          (!(N1.isUndef() || EltsRHS.isZero()) &&
           !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
        return false;
    }

    bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

    createPackShuffleMask(VT, Mask, IsUnary);

    // Where a VSRAI was peeled off above, bump the mask entries that refer
    // to that operand by one.
    if (Offset0 || Offset1) {
      for (int &M : Mask)
        if ((Offset0 && isInRange(M, 0, NumElts)) ||
            (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
          ++M;
    }
    return true;
  }
  case ISD::VSELECT:
  case X86ISD::BLENDV: {
    SDValue Cond = N.getOperand(0);
    if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
      Ops.push_back(N.getOperand(1));
      Ops.push_back(N.getOperand(2));
      return true;
    }
    return false;
  }
  case X86ISD::VTRUNC: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getSizeInBits() != NumSizeInBits)
      return false;
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
    unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
    // Pick every Scale'th narrow element, then fill the remainder with zero.
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Mask.push_back(i * Scale);
    Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
    Ops.push_back(Src);
    return true;
  }
  case ISD::SHL:
  case ISD::SRL: {
    APInt UndefElts;
    SmallVector EltBits;
    if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
                                       UndefElts, EltBits,
                                       /*AllowWholeUndefs*/ true,
                                       /*AllowPartialUndefs*/ false))
      return false;

    // We can only decode 'whole byte' bit shifts as shuffles.
    for (unsigned I = 0; I != NumElts; ++I)
      if (DemandedElts[I] && !UndefElts[I] &&
          (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
        return false;

    // Elements that are not demanded, or whose shift amount is undef, keep
    // their undef mask bytes.
    Mask.append(NumSizeInBytes, SM_SentinelUndef);
    Ops.push_back(N.getOperand(0));

    for (unsigned I = 0; I != NumElts; ++I) {
      if (!DemandedElts[I] || UndefElts[I])
        continue;
      unsigned ByteShift = EltBits[I].getZExtValue() / 8;
      unsigned Lo = I * NumBytesPerElt;
      unsigned Hi = Lo + NumBytesPerElt;
      // Clear mask to all zeros and insert the shifted byte indices.
      std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
      if (ISD::SHL == Opcode)
        std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
      else
        std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
                  Lo + ByteShift);
    }
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    // (Breaking out of the switch reaches the final `return false`.)
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumSizeInBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case X86ISD::VROTLI:
  case X86ISD::VROTRI: {
    // We can only decode 'whole byte' bit rotates as shuffles.
    uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
    if ((RotateVal % 8) != 0)
      return false;
    Ops.push_back(N.getOperand(0));
    int Offset = RotateVal / 8;
    // A left rotate is expressed as the complementary right byte offset.
    Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
    for (int i = 0; i != (int)NumElts; ++i) {
      int BaseIdx = i * NumBytesPerElt;
      for (int j = 0; j != (int)NumBytesPerElt; ++j) {
        Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
      }
    }
    return true;
  }
  case X86ISD::VBROADCAST: {
    SDValue Src = N.getOperand(0);
    // For a scalar source, only accept an extract of element 0 from a vector
    // with the same scalar type, and broadcast from that vector instead.
    if (!Src.getSimpleValueType().isVector()) {
      if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          !isNullConstant(Src.getOperand(1)) ||
          Src.getOperand(0).getValueType().getScalarType() !=
              VT.getScalarType())
        return false;
      Src = Src.getOperand(0);
    }
    Ops.push_back(Src);
    Mask.append(NumElts, 0);
    return true;
  }
  case ISD::SIGN_EXTEND_VECTOR_INREG: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();
    unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();

    // Extended source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (NumBitsPerSrcElt % 8) != 0)
      return false;

    // We can only handle all-signbits extensions.
    APInt DemandedSrcElts =
        DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
    if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
      return false;

    assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 &&
           "Unexpected extension");
    // Each result element repeats source element I Scale times.
    unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
    for (unsigned I = 0; I != NumElts; ++I)
      Mask.append(Scale, I);
    Ops.push_back(Src);
    return true;
  }
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::ANY_EXTEND_VECTOR_INREG: {
    SDValue Src = N.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // Extended source must be a simple vector.
    if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
        (SrcVT.getScalarSizeInBits() % 8) != 0)
      return false;

    bool IsAnyExtend =
        (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
                         IsAnyExtend, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}

/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
/// Mask indices are assumed to address input k through the range
/// [k * Mask.size(), (k + 1) * Mask.size()); both \p Inputs and \p Mask are
/// updated in place.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs,
                                              SmallVectorImpl &Mask) {
  int MaskWidth = Mask.size();
  SmallVector UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    // [lo, hi) is the index range of the input currently being examined,
    // expressed relative to the inputs kept so far.
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      // Drop the input: shift every later index down by one input width.
      for (int &M : Mask)
        if (lo <= M)
          M -= MaskWidth;
      continue;
    }

    // Check for repeated inputs.
    bool IsRepeat = false;
    for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
      if (UsedInputs[j] != Inputs[i])
        continue;
      // Redirect references to the earlier copy (slot j) and shift the
      // indices of any later inputs down by one input width.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (IsRepeat)
      continue;

    UsedInputs.push_back(Inputs[i]);
  }
  Inputs = std::move(UsedInputs);
}

/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
/// If that fails, falls back to getFauxShuffleMask and derives
/// \p KnownUndef / \p KnownZero from the sentinels in the decoded mask.
/// Returns true if the target shuffle mask was decoded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl &Inputs,
                                   SmallVectorImpl &Mask,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return false; // Limit search depth.

  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
    if (ResolveKnownElts)
      resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
    return true;
  }
  if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
                         ResolveKnownElts)) {
    resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
    return true;
  }
  return false;
}

// Convenience overload for callers that do not need the known undef/zero
// bitmasks; they are computed into locals and discarded.
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl &Inputs,
                                   SmallVectorImpl &Mask,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts) {
  APInt KnownUndef, KnownZero;
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
                                KnownZero, DAG, Depth, ResolveKnownElts);
}

// Convenience overload that demands every element of Op. Returns false for
// non-simple or non-vector value types.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs,
                                   SmallVectorImpl &Mask,
                                   const SelectionDAG &DAG, unsigned Depth = 0,
                                   bool ResolveKnownElts = true) {
  EVT VT = Op.getValueType();
  if (!VT.isSimple() || !VT.isVector())
    return false;

  unsigned NumElts = Op.getValueType().getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnes(NumElts);
  return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
                                ResolveKnownElts);
}

// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
// Opcode must be X86ISD::VBROADCAST_LOAD or X86ISD::SUBV_BROADCAST_LOAD.
// The new node loads MemVT from Mem's base pointer plus Offset bytes and
// produces a value of type VT. Returns an empty SDValue if Mem is null or is
// not a simple, temporal read.
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
                                 SelectionDAG &DAG) {
  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
         "Unknown broadcast load type");

  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
  if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
    return SDValue();

  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
                                         TypeSize::getFixed(Offset), DL);
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ops[] = {Mem->getChain(), Ptr};
  SDValue BcstLd = DAG.getMemIntrinsicNode(
      Opcode, DL, Tys, Ops, MemVT,
      DAG.getMachineFunction().getMachineMemOperand(
          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Tie the new load's chain result to the original memory node's chain.
  DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
  return BcstLd;
}

/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
/// Recurses (bounded by SelectionDAG::MaxRecursionDepth) through shuffles,
/// subvector insert/extract/concat nodes and same-element-count bitcasts.
/// Returns an empty SDValue if the element cannot be determined.
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
                                   SelectionDAG &DAG, unsigned Depth) {
  if (Depth >= SelectionDAG::MaxRecursionDepth)
    return SDValue(); // Limit search depth.

  EVT VT = Op.getValueType();
  unsigned Opcode = Op.getOpcode();
  unsigned NumElems = VT.getVectorNumElements();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (auto *SV = dyn_cast(Op)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = VT.getSimpleVT();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector ShuffleMask;
    SmallVector ShuffleOps;
    if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
    SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
  }

  // Recurse into insert_subvector base/sub vector to find scalars.
  if (Opcode == ISD::INSERT_SUBVECTOR) {
    SDValue Vec = Op.getOperand(0);
    SDValue Sub = Op.getOperand(1);
    uint64_t SubIdx = Op.getConstantOperandVal(2);
    unsigned NumSubElts = Sub.getValueType().getVectorNumElements();

    if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
      return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
    return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
  }

  // Recurse into concat_vectors sub vector to find scalars.
  if (Opcode == ISD::CONCAT_VECTORS) {
    EVT SubVT = Op.getOperand(0).getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    uint64_t SubIdx = Index / NumSubElts;
    uint64_t SubElt = Index % NumSubElts;
    return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
  }

  // Recurse into extract_subvector src vector to find scalars.
  if (Opcode == ISD::EXTRACT_SUBVECTOR) {
    SDValue Src = Op.getOperand(0);
    uint64_t SrcIdx = Op.getConstantOperandVal(1);
    return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
  }

  // We only peek through bitcasts of the same vector width.
  if (Opcode == ISD::BITCAST) {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
      return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
    return SDValue();
  }

  // Actual nodes that may contain scalar elements

  // For insert_vector_elt - either return the index matching scalar or recurse
  // into the base vector.
  if (Opcode == ISD::INSERT_VECTOR_ELT &&
      isa(Op.getOperand(2))) {
    if (Op.getConstantOperandAPInt(2) == Index)
      return Op.getOperand(1);
    return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
  }

  if (Opcode == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? Op.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (Opcode == ISD::BUILD_VECTOR)
    return Op.getOperand(Index);

  return SDValue();
}

// Use PINSRB/PINSRW/PINSRD to create a build vector.
// Only the elements flagged in NonZeroMask are inserted. NumNonZero is not
// referenced in this function's body. If no bit of NonZeroMask is set, the
// returned SDValue is empty.
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
                                        const APInt &NonZeroMask,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = NonZeroMask[i];
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, DL);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
                    DAG.getVectorIdxConstant(i, DL));
  }

  return V;
}

/// Custom lower build_vector of v16i8.
/// Returns an empty SDValue when more than 8 elements are non-zero and
/// SSE4.1 is unavailable.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
                                     const APInt &NonZeroMask,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
                                    DAG, Subtarget);

  SDValue V;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  // If both the lowest 16-bits are non-zero, then convert to MOVD.
  if (!NonZeroMask.extractBits(2, 0).isZero() &&
      !NonZeroMask.extractBits(2, 2).isZero()) {
    // Combine the non-zero bytes among elements 0..3 into a single i32.
    for (unsigned I = 0; I != 4; ++I) {
      if (!NonZeroMask[I])
        continue;
      SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
      if (I != 0)
        Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
                          DAG.getConstant(I * 8, DL, MVT::i8));
      V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
    }
    assert(V && "Failed to fold v16i8 vector to zero");
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
    V = DAG.getBitcast(MVT::v8i16, V);
  }
  // Bytes 0..3 are already covered if the block above produced a vector.
  for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
    bool ThisIsNonZero = NonZeroMask[i];
    bool NextIsNonZero = NonZeroMask[i + 1];
    if (!ThisIsNonZero && !NextIsNonZero)
      continue;

    // Merge the byte pair (i, i + 1) into one scalar: low byte | high << 8.
    SDValue Elt;
    if (ThisIsNonZero) {
      if (NumZero || NextIsNonZero)
        Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
      else
        Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
    }

    if (NextIsNonZero) {
      SDValue NextElt = Op.getOperand(i + 1);
      if (i == 0 && NumZero)
        NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
      else
        NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
      NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
                            DAG.getConstant(8, DL, MVT::i8));
      if (ThisIsNonZero)
        Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
      else
        Elt = NextElt;
    }

    // If our first insertion is not the first index or zeros are needed, then
    // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
    // elements undefined).
    if (!V) {
      if (i != 0 || NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
      else {
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
        V = DAG.getBitcast(MVT::v8i16, V);
        continue;
      }
    }
    Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
                    DAG.getVectorIdxConstant(i / 2, DL));
  }

  return DAG.getBitcast(MVT::v16i8, V);
}

/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If this is a splat of a pair of elements, use MOVDDUP (unless the target // has XOP; in that case defer lowering to potentially use VPERMIL2PS). // Because we're creating a less complicated build vector here, we may enable // further folding of the MOVDDUP via shuffle transforms. if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && Op.getOperand(0) == Op.getOperand(2) && Op.getOperand(1) == Op.getOperand(3) && Op.getOperand(0) != Op.getOperand(1)) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); // Create a new build vector with the first 2 elements followed by undef // padding, bitcast to v2f64, duplicate, and bitcast back. SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); return DAG.getBitcast(VT, Dup); } // Find all zeroable elements. std::bitset<4> Zeroable, Undefs; for (int i = 0; i < 4; ++i) { SDValue Elt = Op.getOperand(i); Undefs[i] = Elt.isUndef(); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. 
SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i = 0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZeroOrUndef = (Zeroable == Undefs) ? DAG.getUNDEF(VT) : getZeroVector(VT, Subtarget, DAG, DL); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); } // See if we can lower this build_vector to a INSERTPS. 
if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. 
if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || !LD->isSimple()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. Align RequiredAlign(VT.getSizeInBits() / 8); SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr); if (!InferredAlign || *InferredAlign < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). 
if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign.value()) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } // Recurse to find a LoadSDNode source and the accumulated ByteOffest. static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { if (ISD::isNON_EXTLoad(Elt.getNode())) { auto *BaseLd = cast(Elt); if (!BaseLd->isSimple()) return false; Ld = BaseLd; ByteOffset = 0; return true; } switch (Elt.getOpcode()) { case ISD::BITCAST: case ISD::TRUNCATE: case ISD::SCALAR_TO_VECTOR: return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); case ISD::SRL: if (auto *AmtC = dyn_cast(Elt.getOperand(1))) { uint64_t Amt = AmtC->getZExtValue(); if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { ByteOffset += Amt / 8; return true; } } break; case ISD::EXTRACT_VECTOR_ELT: if (auto *IdxC = dyn_cast(Elt.getOperand(1))) { SDValue Src = Elt.getOperand(0); unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && findEltLoadSrc(Src, Ld, ByteOffset)) { uint64_t Idx = IdxC->getZExtValue(); ByteOffset += Idx * (SrcSizeInBits / 8); return true; } } break; } return false; } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or 
insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize, unsigned Depth = 0) { if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); // Limit search depth. if ((VT.getScalarSizeInBits() % 8) != 0) return SDValue(); unsigned NumElems = Elts.size(); int LastLoadedElt = -1; APInt LoadMask = APInt::getZero(NumElems); APInt ZeroMask = APInt::getZero(NumElems); APInt UndefMask = APInt::getZero(NumElems); SmallVector Loads(NumElems, nullptr); SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an // undef. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); if (Elt.isUndef()) { UndefMask.setBit(i); continue; } if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { ZeroMask.setBit(i); continue; } // Each loaded element must be the correct fractional portion of the // requested vector load. unsigned EltSizeInBits = Elt.getValueSizeInBits(); if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) return SDValue(); if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0) return SDValue(); unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0); if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits) return SDValue(); LoadMask.setBit(i); LastLoadedElt = i; } assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. if (UndefMask.popcount() == NumElems) return DAG.getUNDEF(VT); if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems) return VT.isInteger() ? 
DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); int FirstLoadedElt = LoadMask.countr_zero(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); EVT EltBaseVT = EltBase.getValueType(); assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && "Register/Memory size mismatch"); LoadSDNode *LDBase = Loads[FirstLoadedElt]; assert(LDBase && "Did not find base load for merging consecutive loads"); unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); unsigned BaseSizeInBytes = BaseSizeInBits / 8; int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt); int LoadSizeInBits = NumLoadedElts * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); // TODO: Support offsetting the base load. if (ByteOffsets[FirstLoadedElt] != 0) return SDValue(); // Check to see if the element's load is consecutive to the base load // or offset from a previous (already checked) load. auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { LoadSDNode *Ld = Loads[EltIdx]; int64_t ByteOffset = ByteOffsets[EltIdx]; if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); } int Stride = EltIdx - FirstLoadedElt; if (DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, Stride)) return true; // Try again using the memory load size (we might have broken a large load // into smaller elements), ensure the stride is the full memory load size // apart and a whole number of elements fit in each memory load. 
unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits(); if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 && (BaseMemSizeInBits % BaseSizeInBits) == 0) { unsigned Scale = BaseMemSizeInBits / BaseSizeInBits; return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseMemSizeInBits / 8, Stride / Scale); } return false; }; // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. bool IsConsecutiveLoad = true; bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; } } else if (ZeroMask[i]) { IsConsecutiveLoad = false; } } auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(LDBase->isSimple() && "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; // Check if the base load is entirely dereferenceable. bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); // LOAD - all consecutive load/undefs (must start/end with a load or be // entirely dereferenceable). If we have found an entire vector of loads and // undefs, then return a large load of the entire vector width starting at the // base pointer. If the vector contains zeros, then attempt to shuffle those // elements. 
if (FirstLoadedElt == 0 && (NumLoadedElts == (int)NumElems || IsDereferenceable) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); if (NumElems == 1) return DAG.getBitcast(VT, Elts[FirstLoadedElt]); if (!ZeroMask) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!IsAfterLegalize && VT.isVector()) { unsigned NumMaskElts = VT.getVectorNumElements(); if ((NumMaskElts % NumElems) == 0) { unsigned Scale = NumMaskElts / NumElems; SmallVector ClearMask(NumMaskElts, -1); for (unsigned i = 0; i < NumElems; ++i) { if (UndefMask[i]) continue; int Offset = ZeroMask[i] ? NumMaskElts : 0; for (unsigned j = 0; j != Scale; ++j) ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; } SDValue V = CreateLoad(VT, LDBase); SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } } // If the upper half of a ymm/zmm load is undef then just load the lower half. if (VT.is256BitVector() || VT.is512BitVector()) { unsigned HalfNumElems = NumElems / 2; if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) { EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); SDValue HalfLD = EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, DAG, Subtarget, IsAfterLegalize, Depth + 1); if (HalfLD) return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), HalfLD, DAG.getVectorIdxConstant(0, DL)); } } // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. 
if (IsConsecutiveLoad && FirstLoadedElt == 0 && ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 || LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); // Allow v4f32 on SSE1 only targets. // FIXME: Add more isel patterns so we can just use VT directly. if (!Subtarget.hasSSE2() && VT == MVT::v4f32) VecVT = MVT::v4f32; if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode( X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getBaseAlign(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } // BROADCAST - match the smallest possible repetition pattern, load that // scalar/subvector element and then broadcast to the entire vector. if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { unsigned RepeatSize = SubElems * BaseSizeInBits; unsigned ScalarSize = std::min(RepeatSize, 64u); if (!Subtarget.hasAVX2() && ScalarSize < 32) continue; // Don't attempt a 1:N subvector broadcast - it should be caught by // combineConcatVectorOps, else will cause infinite loops. 
if (RepeatSize > ScalarSize && SubElems == 1) continue; bool Match = true; SmallVector RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); for (unsigned i = 0; i != NumElems && Match; ++i) { if (!LoadMask[i]) continue; SDValue Elt = peekThroughBitcasts(Elts[i]); if (RepeatedLoads[i % SubElems].isUndef()) RepeatedLoads[i % SubElems] = Elt; else Match &= (RepeatedLoads[i % SubElems] == Elt); } // We must have loads at both ends of the repetition. Match &= !RepeatedLoads.front().isUndef(); Match &= !RepeatedLoads.back().isUndef(); if (!Match) continue; EVT RepeatVT = VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) : EVT::getFloatingPointVT(ScalarSize); if (RepeatSize > ScalarSize) RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, RepeatSize / ScalarSize); EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), VT.getSizeInBits() / ScalarSize); if (TLI.isTypeLegal(BroadcastVT)) { if (SDValue RepeatLoad = EltsFromConsecutiveLoads( RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) { SDValue Broadcast = RepeatLoad; if (RepeatSize > ScalarSize) { while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL); } else { if (!Subtarget.hasAVX2() && !X86::mayFoldLoadIntoBroadcastFromMem( RepeatLoad, RepeatVT.getScalarType().getSimpleVT(), Subtarget, /*AssumeSingleUse=*/true)) return SDValue(); Broadcast = DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad); } return DAG.getBitcast(VT, Broadcast); } } } } // REVERSE - attempt to match the loads in reverse and then shuffle back. // TODO: Do this for any permute or mismatching element counts. 
if (Depth == 0 && ZeroMask.isZero() && UndefMask.isZero() && TLI.isTypeLegal(VT) && VT.isVector() && NumElems == VT.getVectorNumElements()) { SmallVector ReverseElts(Elts.rbegin(), Elts.rend()); if (SDValue RevLd = EltsFromConsecutiveLoads( VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) { SmallVector ReverseMask(NumElems); std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0); return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask); } } return SDValue(); } // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses // are consecutive, non-overlapping, and in the right order. static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize) { SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) { Elts.push_back(Elt); continue; } return SDValue(); } assert(Elts.size() == VT.getVectorNumElements()); return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, IsAfterLegalize); } static Constant *getConstantVector(MVT VT, ArrayRef Bits, const APInt &Undefs, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C); auto getConstantScalar = [&](const APInt &Val) -> Constant * { if (VT.isFloatingPoint()) { if (ScalarSize == 16) return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val)); if (ScalarSize == 32) return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); assert(ScalarSize == 64 && "Unsupported floating point scalar size"); return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } return Constant::getIntegerValue(Ty, Val); }; SmallVector ConstantVec; for (unsigned I = 0, E = Bits.size(); I != E; ++I) ConstantVec.push_back(Undefs[I] ? 
UndefValue::get(Ty) : getConstantScalar(Bits[I])); return ConstantVector::get(ArrayRef(ConstantVec)); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); auto getConstantScalar = [&](const APInt &Val) -> Constant * { if (VT.isFloatingPoint()) { if (ScalarSize == 16) return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val)); if (ScalarSize == 32) return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); assert(ScalarSize == 64 && "Unsupported floating point scalar size"); return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); }; if (ScalarSize == SplatBitSize) return getConstantScalar(SplatValue); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned I = 0; I != NumElm; ++I) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I); ConstantVec.push_back(getConstantScalar(Val)); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isFoldableUseOfShuffle(SDNode *N) { for (auto *U : N->users()) { unsigned Opc = U->getOpcode(); // VPERMV/VPERMV3 shuffles can never fold their index operands. if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) return false; if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N) return false; if (isTargetShuffle(Opc)) return true; if (Opc == ISD::BITCAST) // Ignore bitcasts return isFoldableUseOfShuffle(U); if (N->hasOneUse()) { // TODO, there may be some general way to know if a SDNode can // be folded. We now only know whether an MI is foldable. if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N) return false; return true; } } return false; } // If the node has a single use by a VSELECT then AVX512 targets may be able to // fold as a predicated instruction. 
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) { unsigned SizeInBits = V.getValueSizeInBits(); if ((SizeInBits == 512 && Subtarget.hasAVX512()) || (SizeInBits >= 128 && Subtarget.hasVLX())) { if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT && V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { return true; } } return false; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); // See if the build vector is a repeating sequence of scalars (inc. splat). SDValue Ld; BitVector UndefElements; SmallVector Sequence; if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) { assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit."); if (Sequence.size() == 1) Ld = Sequence[0]; } // Attempt to use VBROADCASTM // From this pattern: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (!Sequence.empty() && Subtarget.hasCDI()) { // If not a splat, are the upper sequence values zeroable? 
unsigned SeqLen = Sequence.size(); bool UpperZeroOrUndef = SeqLen == 1 || llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) { return !V || isNullConstantOrUndef(V); }); SDValue Op0 = Sequence[0]; if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) || (Op0.getOpcode() == ISD::ZERO_EXTEND && Op0.getOperand(0).getOpcode() == ISD::BITCAST))) { SDValue BOperand = Op0.getOpcode() == ISD::BITCAST ? Op0.getOperand(0) : Op0.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen); if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen); if (!VT.is512BitVector() && !Subtarget.hasVLX()) { unsigned Scale = 512 / VT.getSizeInBits(); BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen)); } SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand); if (BcstVT.getSizeInBits() != VT.getSizeInBits()) Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits()); return DAG.getBitcast(VT, Bcst); } } } unsigned NumUndefElts = UndefElements.count(); if (!Ld || (NumElts - NumUndefElts) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isFoldableUseOfShuffle(BVOp)) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. 
LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (SplatBitSize == 32 || SplatBitSize == 64 || (SplatBitSize < 32 && Subtarget.hasAVX2())) { // Load the constant scalar/subvector and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; Align Alignment = cast(CP)->getAlign(); SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); SDValue Brdcst = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, MachineMemOperand::MOLoad); return DAG.getBitcast(VT, Brdcst); } if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm); Align Alignment = cast(VCP)->getAlign(); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), VCP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment, MachineMemOperand::MOLoad); } } // If we are moving a scalar into a vector (Ld must be set and all elements // but 1 are undef) and that operation is not obviously supported by // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast. // That's better than general shuffling and may eliminate a load to GPR and // move from scalar to vector register. 
if (!Ld || NumElts - NumUndefElts != 1) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64))) return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // TODO: Handle broadcasts of non-constant sequences. // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. // FIXME: Is the use count needed for non-constant, non-load case? if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.shouldOptForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
if (ScalarSize == 32 || (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) || (CVT == MVT::f16 && Subtarget.hasAVX2()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); Align Alignment = cast(CP)->getAlign(); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CP}; MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, MachineMemOperand::MOLoad); } } // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); // Make sure the non-chain result is only used by this build vector. 
if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0)) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) { auto *LN = cast(Ld); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BCast = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); return BCast; } // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger() && (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) { auto *LN = cast(Ld); SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; SDValue BCast = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); return BCast; } if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // Unsupported broadcast. return SDValue(); } /// For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = ExtIdx->getAsZExtVal(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. 
ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::POISON || Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? 
VecIn2 : DAG.getPOISON(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask); for (unsigned Idx : InsertIndices) NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getVectorIdxConstant(Idx, DL)); return NV; } // Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types. static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16; MVT IVT = VT.changeVectorElementType(SVT); SmallVector NewOps; for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) NewOps.push_back(DAG.getBitcast(SVT, Op.getOperand(I))); SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps); return DAG.getBitcast(VT, Res); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); if (ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorAllOnes(Op.getNode())) return Op; uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (auto *InC = dyn_cast(In)) { Immediate |= (InC->getZExtValue() & 0x1) << idx; HasConstElts = true; } else { NonConstIdx.push_back(idx); } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) { // The build_vector allows the scalar element to be larger than the vector // element type. We need to mask it to use as a condition unless we know // the upper bits are zero. // FIXME: Use computeKnownBits instead of checking specific opcode? 
SDValue Cond = Op.getOperand(SplatIdx); assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!"); if (Cond.getOpcode() != ISD::SETCC) Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, DAG.getConstant(1, dl, MVT::i8)); // Perform the select in the scalar domain so we can use cmov. if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Select = DAG.getSelect(dl, MVT::i32, Cond, DAG.getAllOnesConstant(dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32)); Select = DAG.getBitcast(MVT::v32i1, Select); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select); } else { MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Select = DAG.getSelect(dl, ImmVT, Cond, DAG.getAllOnesConstant(dl, ImmVT), DAG.getConstant(0, dl, ImmVT)); MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; Select = DAG.getBitcast(VecVT, Select); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select, DAG.getVectorIdxConstant(0, dl)); } } // insert elements one by one SDValue DstVec; if (HasConstElts) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32); SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32); ImmL = DAG.getBitcast(MVT::v32i1, ImmL); ImmH = DAG.getBitcast(MVT::v32i1, ImmH); DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH); } else { MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT); MVT VecVT = VT.getSizeInBits() >= 8 ? 
VT : MVT::v8i1; DstVec = DAG.getBitcast(VecVT, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec, DAG.getVectorIdxConstant(0, dl)); } } else DstVec = DAG.getUNDEF(VT); for (unsigned InsertIdx : NonConstIdx) { DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getVectorIdxConstant(InsertIdx, dl)); } return DstVec; } [[maybe_unused]] static bool isHorizOp(unsigned Opcode) { switch (Opcode) { case X86ISD::PACKSS: case X86ISD::PACKUS: case X86ISD::FHADD: case X86ISD::FHSUB: case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::HADDS: case X86ISD::HSUBS: return true; } return false; } /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// 128-bit partial horizontal operation on a 256-bit vector, but that operation /// may not match the layout of an x86 256-bit horizontal instruction. /// In other words, if this returns true, then some extraction/insertion will /// be required to produce a valid horizontal instruction. /// /// Parameter \p Opcode defines the kind of horizontal operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). /// /// TODO: This function was originally used to match both real and fake partial /// horizontal operations, but the index-matching logic is incorrect for that. /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? 
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = Op0.getConstantOperandVal(1); unsigned I1 = Op1.getConstantOperandVal(1); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? 
V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. 
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  const MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  // Split each 256-bit input into its lower and upper 128-bit pieces.
  const unsigned UpperStart = VT.getVectorNumElements() / 2;
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, UpperStart, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, UpperStart, DAG, DL);
  const MVT HalfVT = V0_LO.getSimpleValueType();

  // Each result half stays UNDEF unless a horizontal binop is emitted for it.
  SDValue LO = DAG.getUNDEF(HalfVT);
  SDValue HI = DAG.getUNDEF(HalfVT);

  // Mode set:   LO = op(V0_LO, V0_HI), HI = op(V1_LO, V1_HI).
  // Mode clear: LO = op(V0_LO, V1_LO), HI = op(V0_HI, V1_HI).
  SDValue LoLHS = V0_LO;
  SDValue LoRHS = Mode ? V0_HI : V1_LO;
  SDValue HiLHS = Mode ? V1_LO : V0_HI;
  SDValue HiRHS = V1_HI;

  // Don't emit a horizontal binop if the result is expected to be UNDEF: with
  // Mode set that is decided by the whole source vector, otherwise by both
  // 128-bit operands of the half being undef.
  const bool LoSourcesUndef =
      Mode ? V0->isUndef() : (V0_LO->isUndef() && V1_LO->isUndef());
  const bool HiSourcesUndef =
      Mode ? V1->isUndef() : (V0_HI->isUndef() && V1_HI->isUndef());

  if (!(isUndefLO || LoSourcesUndef))
    LO = DAG.getNode(X86Opcode, DL, HalfVT, LoLHS, LoRHS);
  if (!(isUndefHI || HiSourcesUndef))
    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, HiRHS);

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB/SUBADD operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract) { using namespace SDPatternMatch; MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; HasAllowContract = NumElts != 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting/adding two integer/float elements. unsigned Opc[2] = {0, 0}; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) continue; // Early exit if we found an unexpected opcode. if (Opcode != ISD::FADD && Opcode != ISD::FSUB) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) || !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i)))) return false; // We found a valid add/sub node, make sure its the same opcode as previous // elements for this parity. if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode) return false; Opc[i % 2] = Opcode; // Update InVec0 and InVec1. if (InVec0.isUndef()) InVec0 = Op0.getOperand(0); if (InVec1.isUndef()) InVec1 = Op1.getOperand(0); // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (Opcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. 
std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Increment the number of extractions done. ++NumExtracts; HasAllowContract &= Op->getFlags().hasAllowContract(); } // Ensure we have found an opcode for both parities and that they are // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the // inputs are undef. if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() || InVec1.isUndef()) return false; IsSubAdd = Opc[0] == ISD::FADD; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. /// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. 
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG, SDValue &Opnd0,
                                 SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses,
                                 bool AllowSubAddOrAddSubContract) {
  // Opnd0 must be an FMUL whose result has exactly ExpectedUses uses, and the
  // subtarget must provide some form of FMA.
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  // Fusion needs the contract flag on both the caller-side add/sub nodes
  // (passed in as AllowSubAddOrAddSubContract) and on the FMUL itself.
  bool AllowFusion =
      (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
  if (!AllowFusion)
    return false;

  // Rewrite (Opnd0 = A * B, Opnd1 = C) into (Opnd0 = A, Opnd1 = B, Opnd2 = C).
  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}

/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  bool HasAllowContract;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
                        HasAllowContract))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);

  // Try to generate X86ISD::FMADDSUB node here.
  // NumExtracts (the number of matched lanes) is passed as the expected use
  // count of the multiply.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
                           HasAllowContract)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // We only support ADDSUB.
  if (IsSubAdd)
    return SDValue();

  // There are no known X86 targets with 512-bit ADDSUB instructions!
  // Convert to blend(fsub,fadd).
  if (VT.is512BitVector()) {
    // NOTE(review): this SmallVector has no template argument list in the
    // text under review; the values pushed below are ints. Confirm against
    // the original file.
    SmallVector Mask;
    // Even lanes come from Sub (index I), odd lanes from Add (index I + 1 in
    // the second shuffle operand, i.e. I + E + 1).
    for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
      Mask.push_back(I);
      Mask.push_back(I + E + 1);
    }
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
    return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
  }

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}

// Check whether every non-undef operand of BV is the same ADD/SUB/FADD/FSUB
// applied to two adjacent elements extracted from a single vector, at the
// indices required by the layout checked below. On a match, HOpcode holds the
// corresponding X86ISD::HADD/HSUB/FHADD/FHSUB opcode and V0/V1 hold the source
// vectors used for the first/second 64-bit half of each 128-bit chunk.
// If BV has no non-undef operand at all, this returns true with HOpcode still
// ISD::DELETED_NODE and V0/V1 undef.
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Initialize outputs to known values.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so that makes the index-checking logic below more complicated.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned GenericOpcode = ISD::DELETED_NODE;
  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // Ignore undef elements.
      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
      if (Op.isUndef())
        continue;

      // If there's an opcode mismatch, we're done.
      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
        return false;

      // Initialize horizontal opcode.
      if (HOpcode == ISD::DELETED_NODE) {
        GenericOpcode = Op.getOpcode();
        switch (GenericOpcode) {
        // clang-format off
        case ISD::ADD: HOpcode = X86ISD::HADD; break;
        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
        default: return false;
        // clang-format on
        }
      }

      // Both operands must be constant-index extracts from the same vector,
      // and the binop itself must have a single use.
      // NOTE(review): the two isa(...) calls below carry no template argument
      // in the text under review; since the indices are read with
      // getConstantOperandVal() further down, a constant-node check is
      // presumably intended. Confirm against the original file.
      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa(Op0.getOperand(1)) ||
          !isa(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // The source vector is chosen based on which 64-bit half of the
      // destination vector is being calculated.
      if (j < NumEltsIn64Bits) {
        if (V0.isUndef())
          V0 = Op0.getOperand(0);
      } else {
        if (V1.isUndef())
          V1 = Op0.getOperand(0);
      }

      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      if (SourceVec != Op0.getOperand(0))
        return false;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex =
          i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;

      // If this is not a commutative op, this does not match.
      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
        return false;

      // Addition is commutative, so try swapping the extract indexes.
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        continue;

      // Extract indexes do not match horizontal requirement.
      return false;
    }
  }
  // We matched. Opcode and operands are returned by reference as arguments.
  return true;
}

// Build the node HOpcode(V0, V1) producing BV's type. V0/V1 are first resized
// to BV's width; if BV is 256-bit and all of its upper-half operands are
// undef, the op is emitted at half width and inserted into an undef vector.
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    const SDLoc &DL, SelectionDAG &DAG,
                                    unsigned HOpcode, SDValue V0, SDValue V1) {
  // If either input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  MVT VT = BV->getSimpleValueType(0);
  unsigned Width = VT.getSizeInBits();
  if (V0.getValueSizeInBits() > Width)
    V0 = extractSubVector(V0, 0, DAG, DL, Width);
  else if (V0.getValueSizeInBits() < Width)
    V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);

  if (V1.getValueSizeInBits() > Width)
    V1 = extractSubVector(V1, 0, DAG, DL, Width);
  else if (V1.getValueSizeInBits() < Width)
    V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);

  // An element is demanded iff the corresponding build_vector operand is not
  // undef.
  unsigned NumElts = VT.getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnes(NumElts);
  for (unsigned i = 0; i != NumElts; ++i)
    if (BV->getOperand(i).isUndef())
      DemandedElts.clearBit(i);

  // If we don't need the upper xmm, then perform as a xmm hop.
  unsigned HalfNumElts = NumElts / 2;
  if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
    MVT HalfVT = VT.getHalfNumVectorElementsVT();
    V0 = extractSubVector(V0, 0, DAG, DL, 128);
    V1 = extractSubVector(V1, 0, DAG, DL, 128);
    SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
  }

  return DAG.getNode(HOpcode, DL, VT, V0, V1);
}

/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
/// Returns an empty SDValue when no horizontal form is matched or when the
/// expansion is judged not worthwhile.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  // We need at least 2 non-undef elements to make this worthwhile by default.
  unsigned NumNonUndefs =
      count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
  if (NumNonUndefs < 2)
    return SDValue();

  // There are 4 sets of horizontal math operations distinguished by type:
  // int/FP at 128-bit/256-bit. Each type was introduced with a different
  // subtarget feature. Try to match those "native" patterns first.
  MVT VT = BV->getSimpleValueType(0);
  if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
      ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
      ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
      ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
    unsigned HOpcode;
    SDValue V0, V1;
    if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
      return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
  }

  // Try harder to match 256-bit ops by using extract/concat.
  if (!Subtarget.hasAVX() || !VT.is256BitVector())
    return SDValue();

  // Count the number of UNDEF operands in the build_vector in input.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned Half = NumElts / 2;
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  SDValue InVec0, InVec1;
  if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Match each half of the build_vector separately; the two halves must
    // agree on their source vectors wherever both are defined.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
                              InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
                                   InVec1) &&
             isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
                                   InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed by
      // a concat vector. We must adjust the outputs from the partial horizontal
      // matching calls above to account for undefined vector halves.
      SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
      SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
                                   isUndefHI);
    }
  }

  if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
      VT == MVT::v16i16) {
    // Match the whole build_vector as one partial horizontal op, trying each
    // supported opcode in turn.
    unsigned X86Opcode;
    if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
                              InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
                                   InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}

// Forward declaration; lowerBuildVectorToBitOp() below calls it before its
// definition is visible.
static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG);

/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  unsigned ElemSize = VT.getScalarSizeInBits();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (e.g. ADD/SUB).
  bool IsShift = false;
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    IsShift = true;
    break;
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  // Collect elements.
  // NOTE(review): the SmallVector declaration and the isa(...) call below
  // carry no template arguments in the text under review; the containers are
  // filled with SDValues. Confirm against the original file.
  bool RHSAllConst = true;
  SmallVector LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);
    RHSAllConst &= isa(RHS);
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  // Canonicalize shift amounts.
  if (IsShift) {
    // We expect the canonicalized RHS operand to be the constant.
    // TODO: Permit non-constant XOP/AVX2 cases?
    if (!RHSAllConst)
      return SDValue();

    // Extend shift amounts.
    for (SDValue &Op1 : RHSElts)
      if (Op1.getValueSizeInBits() != ElemSize)
        Op1 = DAG.getZExtOrTrunc(Op1, DL, VT.getScalarType());

    // Limit to shifts by uniform immediates.
    // TODO: Only accept vXi8/vXi64 special cases?
    // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
    if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
      return SDValue();
  }
  // NOTE(review): llvm::concat is written without a template argument in the
  // text under review. Confirm against the original file.
  assert(all_of(llvm::concat(LHSElts, RHSElts),
                [ElemSize](SDValue V) {
                  return V.getValueSizeInBits() == ElemSize;
                }) &&
         "Element size mismatch");

  // To avoid an increase in GPR->FPU instructions, LHS/RHS must be foldable as
  // a load or RHS must be constant.
  SDValue LHS = EltsFromConsecutiveLoads(VT, LHSElts, DL, DAG, Subtarget,
                                         /*IsAfterLegalize=*/true);
  SDValue RHS = EltsFromConsecutiveLoads(VT, RHSElts, DL, DAG, Subtarget,
                                         /*IsAfterLegalize=*/true);
  if (!LHS && !RHS && !RHSAllConst)
    return SDValue();

  // Any side that did not fold into a load is rebuilt as a BUILD_VECTOR.
  if (!LHS)
    LHS = DAG.getBuildVector(VT, DL, LHSElts);
  if (!RHS)
    RHS = DAG.getBuildVector(VT, DL, RHSElts);
  SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);

  if (!IsShift)
    return Res;

  // Immediately lower the shift to ensure the constant build vector doesn't
  // get converted to a constant pool before the shift is lowered.
  return LowerShift(Res, Subtarget, DAG);
}

// Forward declaration; used by lowerBuildVectorAsBlend() below.
static bool isShuffleFoldableLoad(SDValue);

/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
/// representing a blend.
/// Only v4f64 build vectors with exactly two distinct, non-constant,
/// non-undef operands are handled.
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
                                       X86Subtarget const &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = BVOp->getSimpleValueType(0u);

  if (VT != MVT::v4f64)
    return SDValue();

  // Collect unique operands.
  // NOTE(review): SmallSet is written without template arguments in the text
  // under review; it stores SDValues. Confirm the element type and inline
  // size against the original file.
  auto UniqueOps = SmallSet();
  for (SDValue Op : BVOp->ops()) {
    if (isIntOrFPConstant(Op) || Op.isUndef())
      return SDValue();
    UniqueOps.insert(Op);
  }

  // Candidate BUILD_VECTOR must have 2 unique operands.
  if (UniqueOps.size() != 2u)
    return SDValue();

  // Op0 is the first operand; Op1 is whichever unique operand remains.
  SDValue Op0 = BVOp->getOperand(0u);
  UniqueOps.erase(Op0);
  SDValue Op1 = *UniqueOps.begin();

  if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
      isShuffleFoldableLoad(Op1)) {
    // Create shuffle mask.
    // NOTE(review): this SmallVector has no template argument list in the
    // text under review; it holds shuffle mask indices. Confirm against the
    // original file.
    auto const NumElems = VT.getVectorNumElements();
    SmallVector Mask(NumElems);
    for (auto I = 0u; I < NumElems; ++I) {
      SDValue Op = BVOp->getOperand(I);
      // Lane I comes from the Op0 splat (index I) or from the Op1 splat
      // (index I + NumElems).
      Mask[I] = Op == Op0 ? I : I + NumElems;
    }
    // Create shuffle of splats.
    SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
    SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
    return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
  }

  return SDValue();
}

/// Widen a BUILD_VECTOR if the scalar operands are freely mergeable.
/// Adjacent operand pairs are merged into one element of twice the width; the
/// result is a bitcast of the widened build vector back to the original type.
/// Returns an empty SDValue if any pair cannot be merged.
static SDValue widenBuildVector(BuildVectorSDNode *BVOp, SDLoc const &DL,
                                X86Subtarget const &Subtarget,
                                SelectionDAG &DAG) {
  using namespace SDPatternMatch;
  MVT VT = BVOp->getSimpleValueType(0);
  MVT SVT = VT.getScalarType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltBits = SVT.getSizeInBits();

  // Only i8/i16/i32 elements are candidates for widening.
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();

  unsigned WideBits = 2 * EltBits;
  MVT WideSVT = MVT::getIntegerVT(WideBits);
  MVT WideVT = MVT::getVectorVT(WideSVT, NumElts / 2);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(WideSVT))
    return SDValue();

  // NOTE(review): this SmallVector has no template argument list in the text
  // under review; it is filled with SDValues. Confirm against the original
  // file.
  SmallVector WideOps;
  for (unsigned I = 0; I != NumElts; I += 2) {
    SDValue Op0 = BVOp->getOperand(I + 0);
    SDValue Op1 = BVOp->getOperand(I + 1);

    // A fully undef pair widens to a single undef element.
    if (Op0.isUndef() && Op1.isUndef()) {
      WideOps.push_back(DAG.getUNDEF(WideSVT));
      continue;
    }

    // TODO: Constant repacking?

    // Merge scalars that have been split from the same source.
    // Op0 must be trunc(X) and Op1 trunc(srl(Y, EltBits)), with X and Y equal
    // after peekThroughTruncates() and X at least as wide as the widened
    // element.
    SDValue X, Y;
    if (sd_match(Op0, m_Trunc(m_Value(X))) &&
        sd_match(Op1, m_Trunc(m_Srl(m_Value(Y), m_SpecificInt(EltBits)))) &&
        peekThroughTruncates(X) == peekThroughTruncates(Y) &&
        X.getValueType().bitsGE(WideSVT)) {
      if (X.getValueType().bitsGT(WideSVT))
        X = DAG.getNode(ISD::TRUNCATE, DL, WideSVT, X);
      WideOps.push_back(X);
      continue;
    }

    return SDValue();
  }

  assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
  return DAG.getBitcast(VT, DAG.getBuildVector(WideVT, DL, WideOps));
}

/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return Op;

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  // Not a constant that can be materialized without a load.
  return SDValue();
}

/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
///
/// \param VT          Result type of the permute.
/// \param SrcVec      Vector the elements are taken from; may be narrower or
///                    wider than VT.
/// \param IndicesVec  Vector of element indices; must have at least as many
///                    elements as VT.
/// \returns the permuted value of type VT, or an empty SDValue if no suitable
/// permute is available for VT on this subtarget.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT ShuffleVT = VT;
  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Adjust IndicesVec to match VT size.
  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
         "Illegal variable permute mask size");
  if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
    // Narrow/widen the indices vector to the correct size.
    if (IndicesVec.getValueSizeInBits() > SizeInBits)
      IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                    NumElts * VT.getScalarSizeInBits());
    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
                                  SDLoc(IndicesVec), SizeInBits);
    // Zero-extend the index elements within the vector.
    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
                               IndicesVT, IndicesVec);
  }
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

  // Handle SrcVec that don't match VT type.
  if (SrcVec.getValueSizeInBits() != SizeInBits) {
    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
      // Handle larger SrcVec by treating it as a larger permute.
      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
                                  Subtarget, DAG, SDLoc(IndicesVec));
      SDValue NewSrcVec =
          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
      if (NewSrcVec)
        return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
      return SDValue();
    }

    // A larger SrcVec that is not a whole multiple of VT cannot be handled.
    if (SrcVec.getValueSizeInBits() > SizeInBits)
      return SDValue();

    // Widen smaller SrcVec to match VT.
    SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
  }

  // Rewrites an index vector so that it can drive a permute whose elements
  // are Scale times narrower than the index elements.
  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
    EVT SrcVT = Idx.getValueType();
    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
    uint64_t IndexScale = 0;
    uint64_t IndexOffset = 0;

    // If we're scaling a smaller permute op, then we need to repeat the
    // indices, scaling and offsetting them as well.
    // e.g. v4i32 -> v16i8 (Scale = 4)
    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
    }

    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
    return Idx;
  };

  // Pick the permute opcode (and, where needed, a different shuffle type) for
  // VT. Several cases build the complete result themselves and return early.
  unsigned Opcode = 0;
  switch (VT.SimpleTy) {
  default:
    break;
  case MVT::v16i8:
    if (Subtarget.hasSSSE3())
      Opcode = X86ISD::PSHUFB;
    break;
  case MVT::v8i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v4f32:
  case MVT::v4i32:
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v2f64:
  case MVT::v2i64:
    if (Subtarget.hasAVX()) {
      // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
    } else if (Subtarget.hasSSE41()) {
      // SSE41 can compare v2i64 - select between indices 0 and 1.
      return DAG.getSelectCC(
          DL, IndicesVec,
          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
          ISD::CondCode::SETEQ);
    }
    break;
  case MVT::v32i8:
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                              ArrayRef<SDValue> Ops) {
        // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
        // care about the bit[7] as it's just an index vector.
        SDValue Idx = Ops[2];
        EVT VT = Idx.getValueType();
        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
                               ISD::CondCode::SETGT);
      };
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
                              PSHUFBBuilder);
    }
    break;
  case MVT::v16i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // Scale to v32i8 and perform as v32i8.
      IndicesVec = ScaleIndices(IndicesVec, 2);
      return DAG.getBitcast(
          VT, createVariablePermute(
                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (Subtarget.hasAVX2())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {0, 1, 2, 3, 0, 1, 2, 3});
      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        // Without VLX, perform the permute on 8 elements and take the low
        // 256 bits of the result.
        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                                SDLoc(SrcVec));
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
                                            DAG, Subtarget);
        return extract256BitVector(Res, 0, DAG, DL);
      }
      Opcode = X86ISD::VPERMV;
    } else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
      SDValue LoLo =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
      SDValue HiHi =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
      // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      if (Subtarget.hasXOP())
        return DAG.getBitcast(
            VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
                            IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPD only uses index bit[1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v64i8:
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v32i16:
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v16f32:
  case MVT::v16i32:
  case MVT::v8f64:
  case MVT::v8i64:
    if (Subtarget.hasAVX512())
      Opcode = X86ISD::VPERMV;
    break;
  }
  if (!Opcode)
    return SDValue();

  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
         "Illegal variable permute shuffle type");

  // If the shuffle operates on narrower elements than VT, rescale the indices.
  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
  if (Scale > 1)
    IndicesVec = ScaleIndices(IndicesVec, Scale);

  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
  // VPERMV takes the indices as its first operand; the other opcodes take the
  // source vector first.
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  return DAG.getBitcast(VT, Res);
}

// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; auto PeekThroughFreeze = [](SDValue N) { if (N->getOpcode() == ISD::FREEZE && N.hasOneUse()) return N->getOperand(0); return N; }; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = PeekThroughFreeze(V.getOperand(Idx)); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. 
if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getAPIntValue() != Idx) return SDValue(); } MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); MVT OpEltVT = Op.getOperand(0).getSimpleValueType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget); if (VT.getVectorElementType() == MVT::bf16 && (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget)) return VectorCst; unsigned EVTBits = EltVT.getSizeInBits(); APInt UndefMask = APInt::getZero(NumElems); APInt FrozenUndefMask = APInt::getZero(NumElems); APInt ZeroMask = APInt::getZero(NumElems); APInt NonZeroMask = APInt::getZero(NumElems); bool IsAllConstants = true; bool OneUseFrozenUndefs = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) { UndefMask.setBit(i); continue; } if (ISD::isFreezeUndef(Elt.getNode())) { OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse(); FrozenUndefMask.setBit(i); continue; } Values.insert(Elt); if (!isIntOrFPConstant(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) { ZeroMask.setBit(i); } else { NonZeroMask.setBit(i); } } // All undef vector. Return an UNDEF. if (UndefMask.isAllOnes()) return DAG.getUNDEF(VT); // All undef/freeze(undef) vector. Return a FREEZE UNDEF. 
if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes()) return DAG.getFreeze(DAG.getUNDEF(VT)); // All undef/freeze(undef)/zero vector. Return a zero vector. if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes()) return getZeroVector(VT, Subtarget, DAG, dl); // If we have multiple FREEZE-UNDEF operands, we are likely going to end up // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR, // and blend the FREEZE-UNDEF operands back in. // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand? if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount(); NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) { SmallVector BlendMask(NumElems, -1); SmallVector Elts(NumElems, DAG.getUNDEF(OpEltVT)); for (unsigned i = 0; i < NumElems; ++i) { if (UndefMask[i]) { BlendMask[i] = -1; continue; } BlendMask[i] = i; if (!FrozenUndefMask[i]) Elts[i] = Op.getOperand(i); else BlendMask[i] += NumElems; } SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts); SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT)); SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt); return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask); } BuildVectorSDNode *BV = cast(Op.getNode()); // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might // be better off lowering to a smaller build vector and padding with // undef/zero. if ((VT.is256BitVector() || VT.is512BitVector()) && !isFoldableUseOfShuffle(BV)) { unsigned UpperElems = NumElems / 2; APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask; unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one(); if (NumUpperUndefsOrZeros >= UpperElems) { if (VT.is512BitVector() && NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) UpperElems = NumElems - (NumElems / 4); // If freeze(undef) is in any upper elements, force to zero. 
bool UndefUpper = UndefMask.countl_one() >= UpperElems; MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); SDValue NewBV = DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); } } if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG)) return BitOp; if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG)) return Blend; if (SDValue WideBV = widenBuildVector(BV, dl, Subtarget, DAG)) return WideBV; unsigned NumZero = ZeroMask.popcount(); unsigned NumNonZero = NonZeroMask.popcount(); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, fall back to a shuffle to get the scalar blended with the // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && FrozenUndefMask.isZero() && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. 
LLVMContext &Context = *DAG.getContext(); Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getVectorIdxConstant(i, dl); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); unsigned InsertC = InsIndex->getAsZExtVal(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); // There's no good way to insert into the high elements of a >128-bit // vector, so use shuffles to avoid an extract/insert sequence. 
assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) ShuffleMask.push_back(i == InsertC ? NumElts : i); SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = NonZeroMask.countr_zero(); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) || (EltVT == MVT::i16 && Subtarget.hasFP16())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a // zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? 
if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = NonZeroMask.countr_zero(); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. { SmallVector Ops(Op->ops().take_front(NumElems)); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // If this is a splat of pairs of 32-bit elements, we can use a narrower // build_vector and broadcast it. // TODO: We could probably generalize this more. 
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) if (Ops[i % 2] != Op.getOperand(i)) return false; return true; }; if (CanSplat(Op, NumElems, Ops)) { MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; MVT NarrowVT = MVT::getVectorVT(EltVT, 4); // Create a new build vector and cast to v2i64/v2f64. SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.getSizeInBits() > 128) { MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = NonZeroMask.countr_zero(); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. 
if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EltVT == MVT::i16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !NonZeroMask[i]; if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. 
if (Subtarget.hasSSE41() && EltVT != MVT::f16) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getVectorIdxConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? 
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); unsigned NumFreezeUndef = 0; unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; SmallSet Undefs; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isFreezeUndef(SubVec.getNode())) { // If the freeze(undef) has multiple uses then we must fold to zero. if (SubVec.hasOneUse()) { ++NumFreezeUndef; } else { ++NumZero; Undefs.insert(SubVec); } } else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= 1 << i; ++NumNonZero; } } // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // Otherwise, build it up through insert_subvectors. SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT)) : DAG.getUNDEF(ResVT)); // Replace Undef operands with ZeroVector. 
for (SDValue U : Undefs) DAG.ReplaceAllUsesWith( U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl)); MVT SubVT = Op.getOperand(0).getSimpleValueType(); unsigned NumSubElems = SubVT.getVectorNumElements(); for (unsigned i = 0; i != NumOperands; ++i) { if ((NonZeros & (1 << i)) == 0) continue; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i), DAG.getVectorIdxConstant(i * NumSubElems, dl)); } return Vec; } // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. // TODO: Merge this with LowerAVXCONCAT_VECTORS? static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG & DAG) { MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); uint64_t Zeros = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) Zeros |= (uint64_t)1 << i; else NonZeros |= (uint64_t)1 << i; } unsigned NumElems = ResVT.getVectorNumElements(); // If we are inserting non-zero vector and there are zeros in LSBs and undef // in the MSBs we need to emit a KSHIFTL. The generic lowering to // insert_subvector will give us two kshifts. 
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros && Log2_64(NonZeros) != NumOperands - 1) { unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget); Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl); Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op, DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op, DAG.getVectorIdxConstant(0, dl)); } // If there are zero or one non-zeros we can handle this very simply. if (NonZeros == 0 || isPowerOf2_64(NonZeros)) { SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT); if (!NonZeros) return Vec; unsigned Idx = Log2_64(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands / 2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands / 2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getVectorIdxConstant(0, dl)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getVectorIdxConstant(NumElems / 2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == 
MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// Checks whether the vector elements referenced by two shuffle masks are /// equivalent. static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx) { assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx && ExpectedIdx < MaskSize && "Out of range element index"); if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode()) return false; EVT VT = Op.getValueType(); EVT ExpectedVT = ExpectedOp.getValueType(); // Sources must be vectors and match the mask's element count. if (!VT.isVector() || !ExpectedVT.isVector() || (int)VT.getVectorNumElements() != MaskSize || (int)ExpectedVT.getVectorNumElements() != MaskSize) return false; // Exact match. if (Idx == ExpectedIdx && Op == ExpectedOp) return true; switch (Op.getOpcode()) { case ISD::BUILD_VECTOR: // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. 
return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx); case ISD::BITCAST: { SDValue Src = peekThroughBitcasts(Op); EVT SrcVT = Src.getValueType(); if (Op == ExpectedOp && SrcVT.isVector()) { if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) { unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits(); return (Idx % Scale) == (ExpectedIdx % Scale) && IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src, Idx / Scale, ExpectedIdx / Scale); } if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) { unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); for (unsigned I = 0; I != Scale; ++I) if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src, (Idx * Scale) + I, (ExpectedIdx * Scale) + I)) return false; return true; } } break; } case ISD::VECTOR_SHUFFLE: { auto *SVN = cast(Op); return Op == ExpectedOp && SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx); } case X86ISD::VBROADCAST: case X86ISD::VBROADCAST_LOAD: return Op == ExpectedOp; case X86ISD::SUBV_BROADCAST_LOAD: if (Op == ExpectedOp) { auto *MemOp = cast(Op); unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements(); return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts); } break; case X86ISD::VPERMI: { if (Op == ExpectedOp) { SmallVector Mask; DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask); SDValue Src = Op.getOperand(0); return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx], Mask[ExpectedIdx]); } break; } case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: case X86ISD::FHSUB: case X86ISD::PACKSS: case X86ISD::PACKUS: // HOP(X,X) can refer to the elt from the lower/upper half of a lane. // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases. 
if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; int NumHalfEltsPerLane = NumEltsPerLane / 2; bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane); bool SameElt = (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane); return SameLane && SameElt; } break; } return false; } /// Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// Test whether there are elements crossing LaneSizeInBits lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef Mask) { assert(LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && "Illegal shuffle lane size"); int LaneSize = LaneSizeInBits / ScalarSizeInBits; int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. 
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask); } /// Test whether elements in each LaneSizeInBits lane in this shuffle mask come /// from multiple lanes - this is different to isLaneCrossingShuffleMask to /// better support 'repeated mask + lane permute' style shuffles. static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef Mask) { assert(LaneSizeInBits && ScalarSizeInBits && (LaneSizeInBits % ScalarSizeInBits) == 0 && "Illegal shuffle lane size"); int NumElts = Mask.size(); int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits; int NumLanes = NumElts / NumEltsPerLane; if (NumLanes > 1) { for (int i = 0; i != NumLanes; ++i) { int SrcLane = -1; for (int j = 0; j != NumEltsPerLane; ++j) { int M = Mask[(i * NumEltsPerLane) + j]; if (M < 0) continue; int Lane = (M % NumElts) / NumEltsPerLane; if (SrcLane >= 0 && SrcLane != Lane) return true; SrcLane = Lane; } } } return false; } /// Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). 
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. 
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / EltSizeInBits; RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Handle the in-lane shuffles by detecting if and when they repeat. Adjust // later vector indices to start at multiples of LaneSize instead of Size. int LaneM = Mask[i] / Size; int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize); if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(), Mask, RepeatedMask); } /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. 
static bool isShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); int MaskIdx = Mask[i]; int ExpectedIdx = ExpectedMask[i]; if (0 <= MaskIdx && MaskIdx != ExpectedIdx) { SDValue MaskV = MaskIdx < Size ? V1 : V2; SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in /// both, or via a known bits test. static bool isTargetShuffleEquivalent(MVT VT, ArrayRef Mask, ArrayRef ExpectedMask, const SelectionDAG &DAG, SDValue V1 = SDValue(), SDValue V2 = SDValue()) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; assert(llvm::all_of(ExpectedMask, [Size](int M) { return M == SM_SentinelZero || isInRange(M, 0, 2 * Size); }) && "Illegal target shuffle mask"); // Check for out-of-range target shuffle mask indices. if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size)) return false; // Don't use V1/V2 if they're not the same size as the shuffle mask type. 
if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() || !V1.getValueType().isVector())) V1 = SDValue(); if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() || !V2.getValueType().isVector())) V2 = SDValue(); APInt ZeroV1 = APInt::getZero(Size); APInt ZeroV2 = APInt::getZero(Size); for (int i = 0; i < Size; ++i) { int MaskIdx = Mask[i]; int ExpectedIdx = ExpectedMask[i]; if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx) continue; // If we failed to match an expected SM_SentinelZero then early out. if (ExpectedIdx < 0) return false; if (MaskIdx == SM_SentinelZero) { // If we need this expected index to be a zero element, then update the // relevant zero mask and perform the known bits at the end to minimize // repeated computes. SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; if (ExpectedV && Size == (int)ExpectedV.getValueType().getVectorNumElements()) { int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2; ZeroMask.setBit(BitIdx); continue; } } if (MaskIdx >= 0) { SDValue MaskV = MaskIdx < Size ? V1 : V2; SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2; MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size); ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size); if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx)) continue; } return false; } return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) && (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2)); } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. 
static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT, const SelectionDAG &DAG) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) || isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG)); return IsUnpackwdMask; } static bool is128BitUnpackShuffleMask(ArrayRef Mask, const SelectionDAG &DAG) { // Create 128-bit vector type based on mask size. MVT EltVT = MVT::getIntegerVT(128 / Mask.size()); MVT VT = MVT::getVectorVT(EltVT, Mask.size()); // We can't assume a canonical shuffle mask, so try the commuted version too. SmallVector CommutedMask(Mask); ShuffleVectorSDNode::commuteMask(CommutedMask); // Match any of unary/binary or low/high. for (unsigned i = 0; i != 4; ++i) { SmallVector UnpackMask; createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2); if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) || isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG)) return true; } return false; } /// Return true if a shuffle mask chooses elements identically in its top and /// bottom halves. For example, any splat mask has the same top and bottom /// halves. If an element is undefined in only one half of the mask, the halves /// are not considered identical. static bool hasIdenticalHalvesShuffleMask(ArrayRef Mask) { assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask"); unsigned HalfSize = Mask.size() / 2; for (unsigned i = 0; i != HalfSize; ++i) { if (Mask[i] != Mask[i + HalfSize]) return false; } return true; } /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. 
It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); // If the mask only uses one non-undef element, then fully 'splat' it to // improve later broadcast matching. int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask"); int FirstElt = Mask[FirstIndex]; if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; })) return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt; unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } // Canonicalize SHUFPD mask to improve chances of further folding. // Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern. static unsigned getSHUFPDImm(ArrayRef Mask) { assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) && "Unexpected SHUFPD mask size"); assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) && "Unexpected SHUFPD mask elements"); // If the mask only uses one non-undef element, then fully 'splat' it to // improve later broadcast matching. 
int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin(); assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() && "All undef shuffle mask"); int FirstElt = Mask[FirstIndex]; if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) && count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) { unsigned Imm = 0; for (unsigned I = 0, E = Mask.size(); I != E; ++I) Imm |= FirstElt << I; return Imm; } // Attempt to keep any undef elements in place to improve chances of the // shuffle becoming a (commutative) blend. unsigned Imm = 0; for (unsigned I = 0, E = Mask.size(); I != E; ++I) Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I; return Imm; } static SDValue getSHUFPDImmForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8); } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. 
if (NextElement != Mask[i]) return false; NextElement++; } return true; } static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth = 0); /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. 
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? 
V2 : V1; return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask); } static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1, (IsUnary ? V1 : V2))) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. 
if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } /// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) /// followed by unpack 256-bit. 
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { SmallVector Unpckl, Unpckh; createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true); createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false); unsigned UnpackOpcode; if (isShuffleEquivalent(Mask, Unpckl, V1, V2)) UnpackOpcode = X86ISD::UNPCKL; else if (isShuffleEquivalent(Mask, Unpckh, V1, V2)) UnpackOpcode = X86ISD::UNPCKH; else return SDValue(); // This is a "natural" unpack operation (rather than the 128-bit sectored // operation implemented by AVX). We need to rearrange 64-bit chunks of the // input in order to use the x86 instruction. V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1), DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3}); V1 = DAG.getBitcast(VT, V1); return DAG.getNode(UnpackOpcode, DL, VT, V1, V1); } // Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the // source into the lower elements and zeroing the upper elements. static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget) { if (!VT.is512BitVector() && !Subtarget.hasVLX()) return false; unsigned NumElts = Mask.size(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { unsigned SrcEltBits = EltSizeInBits * Scale; if (SrcEltBits < 32 && !Subtarget.hasBWI()) continue; unsigned NumSrcElts = NumElts / Scale; if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale)) continue; unsigned UpperElts = NumElts - NumSrcElts; if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale); SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts); DstVT = MVT::getIntegerVT(EltSizeInBits); if ((NumSrcElts * EltSizeInBits) >= 128) { // ISD::TRUNCATE DstVT = MVT::getVectorVT(DstVT, NumSrcElts); } else { // X86ISD::VTRUNC DstVT = 
MVT::getVectorVT(DstVT, 128 / EltSizeInBits); } return true; } return false; } // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper // element padding to the final DstVT. static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers) { MVT SrcVT = Src.getSimpleValueType(); MVT DstSVT = DstVT.getScalarType(); unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) return SDValue(); // Perform a direct ISD::TRUNCATE if possible. if (NumSrcElts == NumDstElts) return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); if (NumSrcElts > NumDstElts) { MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); } if ((NumSrcElts * DstEltSizeInBits) >= 128) { MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, DstVT.getSizeInBits()); } // Non-VLX targets must truncate from a 512-bit type, so we need to // widen, truncate and then possibly extract the original subvector. if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) { SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512); return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers); } // Fallback to a X86ISD::VTRUNC, padding if necessary. MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits); SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src); if (DstVT != TruncVT) Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, DstVT.getSizeInBits()); return Trunc; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. 
// // An example is the following: // // t0: ch = EntryToken // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 // t25: v4i32 = truncate t2 // t41: v8i16 = bitcast t25 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // // One can just use a single vpmovdw instruction, without avx512vl we need to // use the zmm variant and extract the lower subvector, padding with zeroes. // TODO: Merge with lowerShuffleAsVTRUNC. static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"); if (!Subtarget.hasAVX512()) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { unsigned SrcEltBits = EltSizeInBits * Scale; unsigned NumSrcElts = NumElts / Scale; unsigned UpperElts = NumElts - NumSrcElts; if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; // Attempt to find a matching source truncation, but as a fall back VLX // cases can use the VPMOV directly. SDValue Src = peekThroughBitcasts(V1); if (Src.getOpcode() == ISD::TRUNCATE && Src.getScalarValueSizeInBits() == SrcEltBits) { Src = Src.getOperand(0); } else if (Subtarget.hasVLX()) { MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); Src = DAG.getBitcast(SrcVT, Src); // Don't do this if PACKSS/PACKUS could perform it cheaper. 
if (Scale == 2 && ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) || (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits))) return SDValue(); } else return SDValue(); // VPMOVWB is only available with avx512bw. if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32) return SDValue(); bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts); return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); } return SDValue(); } // Attempt to match binary shuffle patterns as a truncate. static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is128BitVector() || VT.is256BitVector()) && "Unexpected VTRUNC type"); if (!Subtarget.hasAVX512() || (VT.is256BitVector() && !Subtarget.useAVX512Regs())) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned MaxScale = 64 / EltSizeInBits; for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { // TODO: Support non-BWI VPMOVWB truncations? unsigned SrcEltBits = EltSizeInBits * Scale; if (SrcEltBits < 32 && !Subtarget.hasBWI()) continue; // Match shuffle // Bail if the V2 elements are undef. unsigned NumHalfSrcElts = NumElts / Scale; unsigned NumSrcElts = 2 * NumHalfSrcElts; for (unsigned Offset = 0; Offset != Scale; ++Offset) { if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) || isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts)) continue; // The elements beyond the truncation must be undef/zero. unsigned UpperElts = NumElts - NumSrcElts; if (UpperElts > 0 && !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) continue; bool UndefUppers = UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts); // As we're using both sources then we need to concat them together // and truncate from the double-sized src. 
MVT ConcatVT = VT.getDoubleNumVectorElementsVT(); // For offset truncations, ensure that the concat is cheap. SDValue Src = combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget); if (!Src) { if (Offset) continue; Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); } MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); Src = DAG.getBitcast(SrcVT, Src); // Shift the offset'd elements into place for the truncation. // TODO: Use getTargetVShiftByConstNode. if (Offset) Src = DAG.getNode( X86ISD::VSRLI, DL, SrcVT, Src, DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8)); return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); } } return SDValue(); } /// Check whether a compaction lowering can be done by dropping even/odd /// elements and compute how many times even/odd elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// (even) /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// (odd) /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even/odd elements must be dropped /// if there is such a number. Otherwise returns zero. 
static int canLowerByDroppingElements(ArrayRef Mask, bool MatchEven, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; int Offset = MatchEven ? 0 : 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != std::size(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != std::size(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. // Checks for compaction shuffle masks if MaxStages > 1. // TODO: Add support for matching multiple PACKSS/PACKUS stages. 
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages = 1) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && "Illegal maximum compaction"); auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { unsigned NumSrcBits = PackVT.getScalarSizeInBits(); unsigned NumPackedBits = NumSrcBits - BitSize; N1 = peekThroughBitcasts(N1); N2 = peekThroughBitcasts(N2); unsigned NumBits1 = N1.getScalarValueSizeInBits(); unsigned NumBits2 = N2.getScalarValueSizeInBits(); bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false); bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false); if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) || (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits)) return false; if (Subtarget.hasSSE41() || BitSize == 8) { APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) && (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) { V1 = N1; V2 = N2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false); bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false); if ((N1.isUndef() || IsZero1 || IsAllOnes1 || DAG.ComputeNumSignBits(N1) > NumPackedBits) && (N2.isUndef() || IsZero2 || IsAllOnes2 || DAG.ComputeNumSignBits(N2) > NumPackedBits)) { V1 = N1; V2 = N2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } return false; }; // Attempt to match against wider and wider compaction patterns. 
for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) { MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages); // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false, NumStages); if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2)) if (MatchPACK(V1, V2, PackVT)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true, NumStages); if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1)) if (MatchPACK(V1, V1, PackVT)) return true; } return false; } static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT PackVT; unsigned PackOpcode; unsigned SizeBits = VT.getSizeInBits(); unsigned EltBits = VT.getScalarSizeInBits(); unsigned MaxStages = Log2_32(64 / EltBits); if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget, MaxStages)) return SDValue(); unsigned CurrentEltBits = PackVT.getScalarSizeInBits(); unsigned NumStages = Log2_32(CurrentEltBits / EltBits); // Don't lower multi-stage packs on AVX512, truncation is better. if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX()) return SDValue(); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. unsigned MaxPackBits = 16; if (CurrentEltBits > 16 && (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41())) MaxPackBits = 32; // Repeatedly pack down to the target size. 
SDValue Res; for (unsigned i = 0; i != NumStages; ++i) { unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits); unsigned NumSrcElts = SizeBits / SrcEltBits; MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2); Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1), DAG.getBitcast(SrcVT, V2)); V1 = V2 = Res; CurrentEltBits /= 2; } assert(Res && Res.getValueType() == VT && "Failed to lower compaction shuffle"); return Res; } /// Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); SDValue Zero, AllOnes; // Use f64 if i64 isn't legal. if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { EltVT = MVT::f64; MaskVT = MVT::getVectorVT(EltVT, Mask.size()); } MVT LogicVT = VT; if (EltVT.isFloatingPoint()) { Zero = DAG.getConstantFP(0.0, DL, EltVT); APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics()); AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT); LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size()); } else { Zero = DAG.getConstant(0, DL, EltVT); AllOnes = DAG.getAllOnesConstant(DL, EltVT); } SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! 
SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); VMask = DAG.getBitcast(LogicVT, VMask); V = DAG.getBitcast(LogicVT, V); SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? 
AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); return getBitSelect(DL, VT, V1, V2, V1Mask, DAG); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch"); // For 32/64-bit elements, if we only reference one input (plus any undefs), // then ensure the blend mask part for that lane just references that input. bool ForceWholeLaneMasks = VT.is256BitVector() && VT.getScalarSizeInBits() >= 32; // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. for (int Lane = 0; Lane != NumLanes; ++Lane) { // Keep track of the inputs used per lane. 
bool LaneV1InUse = false; bool LaneV2InUse = false; uint64_t LaneBlendMask = 0; for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) { int Elt = (Lane * NumEltsPerLane) + LaneElt; int M = Mask[Elt]; if (M == SM_SentinelUndef) continue; if (M == Elt || (0 <= M && M < NumElts && IsElementEquivalent(NumElts, V1, V1, M, Elt))) { Mask[Elt] = Elt; LaneV1InUse = true; continue; } if (M == (Elt + NumElts) || (NumElts <= M && IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) { LaneBlendMask |= 1ull << LaneElt; Mask[Elt] = Elt + NumElts; LaneV2InUse = true; continue; } if (Zeroable[Elt]) { if (V1IsZeroOrUndef) { ForceV1Zero = true; Mask[Elt] = Elt; LaneV1InUse = true; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; LaneBlendMask |= 1ull << LaneElt; Mask[Elt] = Elt + NumElts; LaneV2InUse = true; continue; } } return false; } // If we only used V2 then splat the lane blend mask to avoid any demanded // elts from V1 in this lane (the V1 equivalent is implicit with a zero // blend mask bit). if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse) LaneBlendMask = (1ull << NumEltsPerLane) - 1; BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane); } return true; } /// Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. 
///
/// \param Original  The shuffle mask. It is copied into a local buffer so that
///                  matchShuffleAsBlend can rewrite the copy.
/// \returns the lowered node, or an empty SDValue if the mask is not a blend.
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                   SDValue V2, ArrayRef<int> Original,
                                   const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  SmallVector<int, 64> Mask(Original);
  if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
                           BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  unsigned NumElts = VT.getVectorNumElements();

  // The cases below deliberately fall through so that the wider types also
  // run the feature asserts of the narrower ones.
  switch (VT.SimpleTy) {
  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    [[fallthrough]];
  case MVT::v4f64:
  case MVT::v8f32:
    assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
    [[fallthrough]];
  case MVT::v2f64:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v4i32:
  case MVT::v8i16:
    assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getTargetConstant(BlendMask, DL, MVT::i8));
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getTargetConstant(BlendMask, DL, MVT::i8));
    }
    // Use PBLENDW for lower/upper lanes and then blend lanes.
    // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
    // merge to VSELECT where useful.
    uint64_t LoMask = BlendMask & 0xFF;
    uint64_t HiMask = (BlendMask >> 8) & 0xFF;
    if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
      SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(LoMask, DL, MVT::i8));
      SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                               DAG.getTargetConstant(HiMask, DL, MVT::i8));
      // Low eight elements from Lo, high eight elements from Hi.
      return DAG.getVectorShuffle(
          MVT::v16i16, DL, Lo, Hi,
          {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
    }
    [[fallthrough]];
  }
  case MVT::v32i8:
    assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
    [[fallthrough]];
  case MVT::v16i8: {
    assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                               Subtarget, DAG))
      return Masked;

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      // The mask immediate is at least 8 bits wide.
      MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // If we have VPTERNLOG, we can use that as a bit blend.
    if (Subtarget.hasVLX())
      if (SDValue BitBlend =
              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return BitBlend;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // x86 allows load folding with blendvb from the 2nd source operand. But
    // we are still using LLVM select here (see comment below), so that's V1.
    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
    // allow that load-folding possibility.
    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
      ShuffleVectorSDNode::commuteMask(Mask);
      std::swap(V1, V2);
    }

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0
                ? DAG.getUNDEF(MVT::i8)
                : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
    bool OptForSize = DAG.shouldOptForSize();
    if (!OptForSize) {
      if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
                                                 Subtarget, DAG))
        return Masked;
    }

    // Otherwise load an immediate into a GPR, cast to k-register, and use a
    // masked move.
    MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}

/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
///
/// \param ImmBlends  If true and the elements are 8 bits wide, give up unless
///                   the blend mask can be widened.
/// \returns the permute-of-blend shuffle, or an empty SDValue if two mask
/// elements need the same lane from different inputs.
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG,
                                             bool ImmBlends = false) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    // Each lane of the blend can hold only one of the two inputs' elements.
    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  // If only immediate blends, then bail if the blend mask can't be widened to
  // i16.
  unsigned EltSize = VT.getScalarSizeInBits();
  if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
    return SDValue();

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}

/// Try to lower as an unpack of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can unpack elements from two inputs and
/// then reduce the shuffle to a single-input (wider) permutation.
///
/// \returns the permute-of-unpack shuffle, or an empty SDValue if the mask
/// cannot be fed by a single UNPCKL or UNPCKH.
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  int NumElts = Mask.size();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;
  int NumHalfLaneElts = NumLaneElts / 2;

  bool MatchLo = true, MatchHi = true;
  // Ops[0] feeds the even result positions, Ops[1] the odd ones.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};

  // Determine UNPCKL/UNPCKH type and operand order.
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;

    // Normalize the mask value depending on whether it's V1 or V2.
    int NormM = M;
    SDValue &Op = Ops[Elt & 1];
    if (M < NumElts && (Op.isUndef() || Op == V1))
      Op = V1;
    else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
      Op = V2;
      NormM -= NumElts;
    } else
      return SDValue();

    // Find out whether the element lives in the low or the high half of its
    // 128-bit lane.
    bool MatchLoAnyLane = false, MatchHiAnyLane = false;
    for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
      int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
      MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
      MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
      if (MatchLoAnyLane || MatchHiAnyLane) {
        assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
               "Failed to match UNPCKLO/UNPCKHI");
        break;
      }
    }
    MatchLo &= MatchLoAnyLane;
    MatchHi &= MatchHiAnyLane;
    if (!MatchLo && !MatchHi)
      return SDValue();
  }
  assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");

  // Element indices have changed after unpacking. Calculate permute mask
  // so that they will be put back to the position as dictated by the
  // original shuffle mask indices.
  SmallVector<int, 32> PermuteMask(NumElts, -1);
  for (int Elt = 0; Elt != NumElts; ++Elt) {
    int M = Mask[Elt];
    if (M < 0)
      continue;
    int NormM = M;
    if (NumElts <= M)
      NormM -= NumElts;
    bool IsFirstOp = M < NumElts;
    // Position of the element within the unpack result if it came from the
    // first unpack operand; the second operand lands one slot later.
    int BaseMaskElt =
        NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
    if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
      PermuteMask[Elt] = BaseMaskElt;
    else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
      PermuteMask[Elt] = BaseMaskElt + 1;
    assert(PermuteMask[Elt] != -1 &&
           "Input mask element is defined but failed to assign permute mask");
  }

  unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
  SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
  return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
}

/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
/// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Mask.size() >= 2 && "Single element masks are invalid."); // This routine only supports 128-bit integer dual input vectors. if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef()) return SDValue(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. 
if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If we're shuffling with a zero vector then we're better off not doing // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. if (ISD::isBuildVectorAllZeros(V1.getNode()) || ISD::isBuildVectorAllZeros(V2.getNode())) return SDValue(); // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 
0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. static SDValue lowerShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) return SDValue(); // We don't currently support lane crossing permutes. if (is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); int Scale = VT.getScalarSizeInBits() / 8; int NumLanes = VT.getSizeInBits() / 128; int NumElts = VT.getVectorNumElements(); int NumEltsPerLane = NumElts / NumLanes; // Determine range of mask elts. bool Blend1 = true; bool Blend2 = true; std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) { Blend1 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range1.first = std::min(Range1.first, M); Range1.second = std::max(Range1.second, M); } else { M -= NumElts; Blend2 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range2.first = std::min(Range2.first, M); Range2.second = std::max(Range2.second, M); } } } // Bail if we don't need both elements. // TODO - it might be worth doing this for unary shuffles if the permute // can be widened. 
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || !(0 <= Range2.first && Range2.second < NumEltsPerLane)) return SDValue(); if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) return SDValue(); // Rotate the 2 ops so we can access both ranges, then permute the result. auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); else PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); } } return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); }; // Check if the ranges are small enough to rotate from either direction. if (Range2.second < Range1.first) return RotateAndPermute(V1, V2, Range1.first, 0); if (Range1.second < Range2.first) return RotateAndPermute(V2, V1, Range2.first, NumElts); return SDValue(); } static bool isBroadcastShuffleMask(ArrayRef Mask) { return isUndefOrEqual(Mask, 0); } static bool isNoopOrBroadcastShuffleMask(ArrayRef Mask) { return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask); } /// Check if the Mask consists of the same element repeated multiple times. static bool isSingleElementRepeatedMask(ArrayRef Mask) { size_t NumUndefs = 0; std::optional UniqueElt; for (int Elt : Mask) { if (Elt == SM_SentinelUndef) { NumUndefs++; continue; } if (UniqueElt.has_value() && UniqueElt.value() != Elt) return false; UniqueElt = Elt; } // Make sure the element is repeated enough times by checking the number of // undefs is small. 
return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value(); } /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. static SDValue lowerShuffleAsDecomposedShuffleMerge( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; // Shuffle the input elements into the desired positions in V1 and V2 and // unpack/blend them together. bool IsAlternating = true; bool V1Zero = true, V2Zero = true; SmallVector V1Mask(NumElts, -1); SmallVector V2Mask(NumElts, -1); SmallVector FinalMask(NumElts, -1); for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; if (M >= 0 && M < NumElts) { V1Mask[i] = M; FinalMask[i] = i; V1Zero &= Zeroable[i]; IsAlternating &= (i & 1) == 0; } else if (M >= NumElts) { V2Mask[i] = M - NumElts; FinalMask[i] = i + NumElts; V2Zero &= Zeroable[i]; IsAlternating &= (i & 1) == 1; } } // If we effectively only demand the 0'th element of \p Input, and not only // as 0'th element, then broadcast said input, // and change \p InputMask to be a no-op (identity) mask. 
auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget, &DAG](SDValue &Input, MutableArrayRef InputMask) { unsigned EltSizeInBits = Input.getScalarValueSizeInBits(); if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 || !X86::mayFoldLoad(Input, Subtarget))) return; if (isNoopShuffleMask(InputMask)) return; assert(isBroadcastShuffleMask(InputMask) && "Expected to demand only the 0'th element."); Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input); for (auto I : enumerate(InputMask)) { int &InputMaskElt = I.value(); if (InputMaskElt >= 0) InputMaskElt = I.index(); } }; // Currently, we may need to produce one shuffle per input, and blend results. // It is possible that the shuffle for one of the inputs is already a no-op. // See if we can simplify non-no-op shuffles into broadcasts, // which we consider to be strictly better than an arbitrary shuffle. if (isNoopOrBroadcastShuffleMask(V1Mask) && isNoopOrBroadcastShuffleMask(V2Mask)) { canonicalizeBroadcastableInput(V1, V1Mask); canonicalizeBroadcastableInput(V2, V2Mask); } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // If we don't have blends, see if we can create a cheap unpack. if (!Subtarget.hasSSE41() && VT.is128BitVector() && (is128BitUnpackShuffleMask(V1Mask, DAG) || is128BitUnpackShuffleMask(V2Mask, DAG))) if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( DL, VT, V1, V2, Mask, Subtarget, DAG)) return PermUnpack; // Only prefer immediate blends to unpack/rotate. 
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; // If either input vector provides only a single element which is repeated // multiple times, unpacking from both input vectors would generate worse // code. e.g. for // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4 // it is better to process t4 first to create a vector of t4[0], then unpack // that vector with t2. if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) && !isSingleElementRepeatedMask(V2Mask)) if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; if (VT.getScalarSizeInBits() >= 32) if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( DL, VT, V1, V2, Mask, Subtarget, DAG)) return PermUnpack; } // If the final mask is an alternating blend of vXi8/vXi16, convert to an // UNPCKL(SHUFFLE, SHUFFLE) pattern. // TODO: It doesn't have to be alternating - but each lane mustn't have more // than half the elements coming from each source. 
if (IsAlternating && VT.getScalarSizeInBits() < 32) { V1Mask.assign(NumElts, -1); V2Mask.assign(NumElts, -1); FinalMask.assign(NumElts, -1); for (int i = 0; i != NumElts; i += NumEltsPerLane) for (int j = 0; j != NumEltsPerLane; ++j) { int M = Mask[i + j]; if (M >= 0 && M < NumElts) { V1Mask[i + (j / 2)] = M; FinalMask[i + j] = i + (j / 2); } else if (M >= NumElts) { V2Mask[i + (j / 2)] = M - NumElts; FinalMask[i + j] = i + (j / 2) + NumElts; } } } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, const X86Subtarget &Subtarget, ArrayRef Mask) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers"); // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2; int MaxSubElts = 64 / EltSizeInBits; unsigned RotateAmt, NumSubElts; if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts, MaxSubElts, NumSubElts, RotateAmt)) return -1; unsigned NumElts = Mask.size(); MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); return RotateAmt; } /// Lower shuffle using X86ISD::VROTLI rotations. static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Only XOP + AVX512 targets have bit rotation instructions. // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this. 
bool IsLegal = (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512(); if (!IsLegal && Subtarget.hasSSE3()) return SDValue(); MVT RotateVT; int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(), Subtarget, Mask); if (RotateAmt < 0) return SDValue(); // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL, // expanded to OR(SRL,SHL), will be more efficient, but if they can // widen to vXi16 or more then existing lowering should will be better. if (!IsLegal) { if ((RotateAmt % 16) == 0) return SDValue(); // TODO: Use getTargetVShiftByConstNode. unsigned ShlAmt = RotateAmt; unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt; V1 = DAG.getBitcast(RotateVT, V1); SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1, DAG.getTargetConstant(ShlAmt, DL, MVT::i8)); SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1, DAG.getTargetConstant(SrlAmt, DL, MVT::i8)); SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL); return DAG.getBitcast(VT, Rot); } SDValue Rot = DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1), DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); return DAG.getBitcast(VT, Rot); } /// Try to match a vector shuffle as an element rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. 
int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. 
/// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (isAnyZero(Mask)) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. 
if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. 
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); if (0 < Rotation) return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getTargetConstant(Rotation, DL, MVT::i8)); // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ. // TODO: Pull this out as a matchShuffleAsElementShift helper? // TODO: We can probably make this more aggressive and use shift-pairs like // lowerShuffleAsByteShiftMask. unsigned NumElts = Mask.size(); unsigned ZeroLo = Zeroable.countr_one(); unsigned ZeroHi = Zeroable.countl_one(); assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected"); if (!ZeroLo && !ZeroHi) return SDValue(); if (ZeroLo) { SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2; int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts; if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low)) return DAG.getNode(X86ISD::VALIGN, DL, VT, Src, getZeroVector(VT, Subtarget, DAG, DL), DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8)); } if (ZeroHi) { SDValue Src = Mask[0] < (int)NumElts ? V1 : V2; int Low = Mask[0] < (int)NumElts ? 0 : NumElts; if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi)) return DAG.getNode(X86ISD::VALIGN, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), Src, DAG.getTargetConstant(ZeroHi, DL, MVT::i8)); } return SDValue(); } /// Try to lower a vector shuffle as a byte shift sequence. static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); assert(VT.is128BitVector() && "Only 128-bit vectors supported"); // We need a shuffle that has zeros at one/both ends and a sequential // shuffle from one source within. 
unsigned ZeroLo = Zeroable.countr_one(); unsigned ZeroHi = Zeroable.countl_one(); if (!ZeroLo && !ZeroHi) return SDValue(); unsigned NumElts = Mask.size(); unsigned Len = NumElts - (ZeroLo + ZeroHi); if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) return SDValue(); unsigned Scale = VT.getScalarSizeInBits() / 8; ArrayRef StubMask = Mask.slice(ZeroLo, Len); if (!isUndefOrInRange(StubMask, 0, NumElts) && !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) return SDValue(); SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2; Res = DAG.getBitcast(MVT::v16i8, Res); // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an // inner sequential set of elements, possibly offset: // 01234567 --> zzzzzz01 --> 1zzzzzzz // 01234567 --> 4567zzzz --> zzzzz456 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz if (ZeroLo == 0) { unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); } else if (ZeroHi == 0) { unsigned Shift = Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else if (!Subtarget.hasSSSE3()) { // If we don't have PSHUFB then its worth avoiding an AND constant mask // by performing 3 byte shifts. Shuffle combining can kick in above that. // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. 
unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Shift += Mask[ZeroLo] % NumElts; Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); } else return SDValue(); return DAG.getBitcast(VT, Res); } /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 
0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 
64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ)) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. 
int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefUpperHalf(Mask)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. 
if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getTargetConstant(BitLen, DL, MVT::i8), DAG.getTargetConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// Lower a vector shuffle as an any/signed/zero extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either an extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. 
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
                                               int Scale, int Offset,
                                               unsigned ExtOpc, SDValue InputV,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid a/zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = DAG.getBitcast(VT, InputV);
    InputV = ShuffleOffset(InputV);
    InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
  InputV = DAG.getBitcast(VT, InputV);
  bool AnyExt = ExtOpc == ISD::ANY_EXTEND;

  // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
  if (ExtOpc == ISD::SIGN_EXTEND)
    return SDValue();

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    // A single extract suffices when the upper half is undef or the second
    // element would come from a different lane.
    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if ((i % Scale == 0 && SafeOffset(Idx))) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks. Each round halves Scale and
  // NumElements and doubles EltBits.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}

/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it if
  // valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
    return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
                                           InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { return V.hasOneUse() && ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode()); } template static bool isSoftF16(T VT, const X86Subtarget &Subtarget) { T EltVT = VT.getScalarType(); return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) || (EltVT == MVT::f16 && !Subtarget.hasFP16()); } /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
static SDValue lowerShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltBits = VT.getScalarSizeInBits(); if (isSoftF16(EltVT, Subtarget)) return SDValue(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr; bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Bail if a non-zero V1 isn't used in place. if (!IsV1Zeroable) { SmallVector V1Mask(Mask); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) { // Using zext to expand a narrow element won't work for non-zero // insertions. But we can use a masked constant vector if we're // inserting V2 into the bottom of V1. if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0)) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); // If we're inserting into a constant, mask off the inserted index // and OR with the zero-extended scalar. 
if (!IsV1Zeroable) { SmallVector Bits(NumElts, APInt::getAllOnes(EltBits)); Bits[V2Index] = APInt::getZero(EltBits); SDValue BitMask = getConstVector(Bits, VT, DAG, DL); V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask); V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2)); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD, MOVSS or MOVSH. unsigned MovOpc = 0; if (EltVT == MVT::f16) MovOpc = X86ISD::MOVSH; else if (EltVT == MVT::f32) MovOpc = X86ISD::MOVSS; else if (EltVT == MVT::f64) MovOpc = X86ISD::MOVSD; else llvm_unreachable("Unsupported floating point element type to handle!"); return DAG.getNode(MovOpc, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. 
if (VT.isFloatingPoint() || NumElts <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); MVT EltVT = VT.getVectorElementType(); MVT V0VT = V0.getSimpleValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); MVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. 
if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8)); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Test whether the specified input (0 or 1) is a broadcast/splat blended by /// the given mask. 
/// static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef Mask, int BroadcastableElement = 0) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != BroadcastableElement) return false; return true; } /// If we are extracting two 128-bit halves of a vector and shuffling the /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a /// multi-shuffle lowering. static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef Mask, SelectionDAG &DAG) { MVT VT = N0.getSimpleValueType(); assert((VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"); // Check that both sources are extracts of the same source vector. if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR || N1.getOpcode() != ISD::EXTRACT_SUBVECTOR || N0.getOperand(0) != N1.getOperand(0) || !N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue WideVec = N0.getOperand(0); MVT WideVT = WideVec.getSimpleValueType(); if (!WideVT.is256BitVector()) return SDValue(); // Match extracts of each half of the wide source vector. Commute the shuffle // if the extract of the low half is N1. unsigned NumElts = VT.getVectorNumElements(); SmallVector NewMask(Mask); const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1); const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1); if (ExtIndex1 == 0 && ExtIndex0 == NumElts) ShuffleVectorSDNode::commuteMask(NewMask); else if (ExtIndex0 != 0 || ExtIndex1 != NumElts) return SDValue(); // Final bailout: if the mask is simple, we are better off using an extract // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps // because that avoids a constant load from memory. 
if (NumElts == 4 && (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG))) return SDValue(); // Extend the shuffle mask with undef elements. NewMask.append(NumElts, -1); // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask); // This is free: ymm -> xmm. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf, DAG.getVectorIdxConstant(0, DL)); } /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT EltVT = VT.getVectorElementType(); if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16)))) return SDValue(); // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. int BroadcastIdx = getSplatIndex(Mask); if (BroadcastIdx < 0) { // Check for hidden broadcast. 
SmallVector BroadcastMask(VT.getVectorNumElements(), 0); if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2)) return SDValue(); BroadcastIdx = 0; } assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; }); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. // TODO: Combine this logic with findEltLoadSrc() used by // EltsFromConsecutiveLoads(). int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { V = V.getOperand(0); continue; } case ISD::CONCAT_VECTORS: { int OpBitWidth = V.getOperand(0).getValueSizeInBits(); int OpIdx = BitOffset / OpBitWidth; V = V.getOperand(OpIdx); BitOffset %= OpBitWidth; continue; } case ISD::EXTRACT_SUBVECTOR: { // The extraction index adds to the existing offset. unsigned EltBitWidth = V.getScalarValueSizeInBits(); unsigned Idx = V.getConstantOperandVal(1); unsigned BeginOffset = Idx * EltBitWidth; BitOffset += BeginOffset; V = V.getOperand(0); continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); int EltBitWidth = VOuter.getScalarValueSizeInBits(); int Idx = (int)V.getConstantOperandVal(2); int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); int BeginOffset = Idx * EltBitWidth; int EndOffset = BeginOffset + NumSubElts * EltBitWidth; if (BeginOffset <= BitOffset && BitOffset < EndOffset) { BitOffset -= BeginOffset; V = VInner; } else { V = VOuter; } continue; } } break; } assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset"); BroadcastIdx = BitOffset / NumEltBits; // Do we need to bitcast the source to retrieve the original broadcast index? bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits; // Check if this is a broadcast of a scalar. 
We special case lowering // for scalars so that we can more effectively fold with loads. // If the original value has a larger element type than the shuffle, the // broadcast element is in essence truncated. Make that explicit to ease // folding. if (BitCastSrc && VT.isInteger()) if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast( DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; // Also check the simpler case, where we can directly reuse the scalar. if (!BitCastSrc && ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (ISD::isNormalLoad(V.getNode()) && cast(V)->isSimple()) { // We do not check for one-use of the vector load because a broadcast load // is expected to be a win for code size, register pressure, and possibly // uops even if the original vector load is not eliminated. // Reduce the vector load and shuffle to a broadcasted scalar load. auto *Ld = cast(V); SDValue BaseAddr = Ld->getBasePtr(); MVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL); // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather // than MOVDDUP. // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX? 
if (Opcode == X86ISD::VBROADCAST) { SDVTList Tys = DAG.getVTList(VT, MVT::Other); SDValue Ops[] = {Ld->getChain(), NewAddr}; V = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); return DAG.getBitcast(VT, V); } assert(SVT == MVT::f64 && "Unexpected VT!"); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); } else if (BitOffset != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // If we are broadcasting an element from the lowest 128-bit subvector, try // to move the element in position. if (BitOffset < 128 && NumActiveElts > 1 && V.getScalarValueSizeInBits() == NumEltBits) { assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"); SmallVector ExtractMask(128 / NumEltBits, SM_SentinelUndef); ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits(); V = extractSubVector(V, 0, DAG, DL, 128); V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask); } else { // Only broadcast the zero-element of a 128-bit subvector. 
if ((BitOffset % 128) != 0) return SDValue(); assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && "Unexpected bit-offset"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); V = extract128BitVector(V, ExtractIdx, DAG, DL); } } // On AVX we can use VBROADCAST directly for scalar sources. if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) { V = DAG.getBitcast(MVT::f64, V); if (Subtarget.hasAVX()) { V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V); return DAG.getBitcast(VT, V); } V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V); } // If this is a scalar, do the broadcast on this type and bitcast. if (!V.getValueType().isVector()) { assert(V.getScalarValueSizeInBits() == NumEltBits && "Unexpected scalar size"); MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(), VT.getVectorNumElements()); return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (V.getValueSizeInBits() > 128) V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); // Otherwise cast V to a vector with the same element type as VT, but // possibly narrower than VT. Then perform the broadcast. unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts); return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. 
We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. 
if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask = 0; if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. 
if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. 
if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) || isShuffleEquivalent(Mask, {1, 3}, V1, V2)) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. 
V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), Mask[1] < 0 ? -1 : (Mask[1] * 2), Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; SmallVector NewMask(Mask); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. 
if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 
3 : 1; } } else if (NumV2Elements == 3) { // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but // we can get here due to other paths (e.g repeated mask matching) that we // don't want to do another round of lowerVECTOR_SHUFFLE. ShuffleVectorSDNode::commuteMask(NewMask); return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. 
return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2)) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2)) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasSSE2()) if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) { ZExt = DAG.getBitcast(MVT::v4f32, ZExt); return ZExt; } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. 
if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2)) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); // Try to use shift instructions if fast. if (Subtarget.preferLowerShuffleAsShift()) { if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ true)) return Shift; if (NumV2Elements == 0) if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) return Rotate; } if (NumV2Elements == 0) { // Try to use broadcast unless the mask only has one non-undef element. 
if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; } // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (!isSingleElementRepeatedMask(Mask)) { if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2)) Mask = UnpackLoMask; else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2)) Mask = UnpackHiMask; } return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } if (Subtarget.hasAVX2()) if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) return Extract; // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. 
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. 
/// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); // Attempt to directly match PSHUFLW or PSHUFHW. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? 
HiMask[i] : (HiMask[i] - 4)); return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); } SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(llvm::unique(LoInputs), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(llvm::unique(HiInputs), HiInputs.end()); int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are shuffling values from one half - check how many different DWORD // pairs we need to create. If only 1 or 2 then we can perform this as a // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { int PSHUFDMask[4] = { -1, -1, -1, -1 }; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); // Collect the different DWORD pairs. for (int DWord = 0; DWord != 4; ++DWord) { int M0 = Mask[2 * DWord + 0]; int M1 = Mask[2 * DWord + 1]; M0 = (M0 >= 0 ? M0 % 4 : M0); M1 = (M1 >= 0 ? 
M1 % 4 : M1); if (M0 < 0 && M1 < 0) continue; bool Match = false; for (int j = 0, e = DWordPairs.size(); j < e; ++j) { auto &DWordPair = DWordPairs[j]; if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); PSHUFDMask[DWord] = DOffset + j; Match = true; break; } } if (!Match) { PSHUFDMask[DWord] = DOffset + DWordPairs.size(); DWordPairs.push_back(std::make_pair(M0, M1)); } } if (DWordPairs.size() <= 2) { DWordPairs.resize(2, std::make_pair(-1, -1)); int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, DWordPairs[1].first, DWordPairs[1].second}; // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds. if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) && ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) { int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask); std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx); PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0; PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1; } if ((NumHToL + NumHToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); if ((NumLToL + NumLToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); } } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. 
In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. 
int ADWord = 0, BDWord = 0; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + llvm::count(AToBInputs, 2 * ADWord + 1); int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + llvm::count(BToBInputs, 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. 
auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. 
for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. 
for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef