From 47ea18840aa0364a969073ef71d08c18fbc044b3 Mon Sep 17 00:00:00 2001
From: Anton Malakhov
Date: Thu, 1 Nov 2018 17:10:55 -0500
Subject: [PATCH] Fixes vectorizer and extends SVML support

This patch is created on top of LLVM 8.0.0 and merges several fixes:

1. https://reviews.llvm.org/D47188
   This patch fixes improper calls to the SVML library, which uses non-standard
   calling conventions. Accordingly, it adds the SVML calling-convention
   definitions and the code that sets this calling convention on the vectorized
   calls. Because SVML provides several implementations of each math function,
   the fast-math attribute is also taken into account and a faster
   implementation is selected when it is set. This work is based on Matt
   Masten's original work.
   Author: Denis Nagorny

2. https://reviews.llvm.org/D53035
   This patch implements support for legalizing SVML calls by breaking an
   illegal vector call instruction down into multiple legal vector call
   instructions during code generation. Currently the vectorizer does not check
   the legality of the generated SVML (or any VECLIB) call instructions, which
   can lead to problems even during vector type legalization. This patch
   addresses the issue by adding a legality check during code generation and
   replacing an illegal SVML call with the corresponding legalized
   instructions.
   (RFC: http://lists.llvm.org/pipermail/llvm-dev/2018-June/124357.html)
   Author: Karthik Senthil

3. A functional merge of the two patches above, which fixes the calling
   convention.
---
 include/llvm/Analysis/TargetLibraryInfo.h | 17 +-
 include/llvm/IR/CMakeLists.txt | 4 +
 include/llvm/IR/CallingConv.h | 3 +
 include/llvm/IR/SVML.td | 62 +++
 lib/Analysis/CMakeLists.txt | 1 +
 lib/Analysis/TargetLibraryInfo.cpp | 126 +----
 lib/AsmParser/LLLexer.cpp | 1 +
 lib/AsmParser/LLParser.cpp | 2 +
 lib/AsmParser/LLToken.h | 1 +
 lib/IR/AsmWriter.cpp | 1 +
 lib/IR/Verifier.cpp | 1 +
 lib/Target/X86/X86CallingConv.td | 59 ++-
 lib/Target/X86/X86ISelLowering.cpp | 3 +-
 lib/Target/X86/X86RegisterInfo.cpp | 34 ++
 lib/Target/X86/X86Subtarget.h | 1 +
 lib/Transforms/Vectorize/LoopVectorize.cpp | 279 ++++++++++-
 test/Transforms/LoopVectorize/X86/scatter_crash.ll | 0
 .../LoopVectorize/X86/svml-calls-finite.ll | 9 +-
 test/Transforms/LoopVectorize/X86/svml-calls.ll | 81 +++-
 .../LoopVectorize/X86/svml-legal-calls.ll | 513 +++++++++++++++++++++
 .../LoopVectorize/X86/svml-legal-codegen.ll | 61 +++
 utils/TableGen/CMakeLists.txt | 1 +
 utils/TableGen/SVMLEmitter.cpp | 110 +++++
 utils/TableGen/TableGen.cpp | 8 +-
 utils/TableGen/TableGenBackends.h | 1 +
 utils/vim/syntax/llvm.vim | 1 +
 26 files changed, 1238 insertions(+), 142 deletions(-)
 create mode 100644 include/llvm/IR/SVML.td
 mode change 100755 => 100644 test/Transforms/LoopVectorize/X86/scatter_crash.ll
 create mode 100644 test/Transforms/LoopVectorize/X86/svml-legal-calls.ll
 create mode 100644 test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
 create mode 100644 utils/TableGen/SVMLEmitter.cpp

diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h
index a3fe834..2b93099 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/include/llvm/Analysis/TargetLibraryInfo.h
@@ -38,6 +38,12 @@ struct VecDesc {
   NumLibFuncs
 };

+enum SVMLAccuracy {
+  SVML_DEFAULT,
+  SVML_HA,
+  SVML_EP
+};
+
 /// Implementation of the target library information.
 ///
 /// This class constructs tables that hold the target library information and
@@ -150,7 +156,8 @@ public:
   /// Return true if the function F has a vector equivalent with vectorization
   /// factor VF.
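For reference, the widened lookup interface above is what the vectorizer consumes later in this patch. A minimal sketch of a caller, mirroring the LoopVectorize.cpp change below; CI is assumed to be the scalar call being widened, FnName its name, VF the chosen vectorization factor, and TLI the TargetLibraryInfo:

    bool FromSVML = false;
    bool IsFast = CI->getFastMathFlags().isFast();
    std::string VFnName = TLI->getVectorizedFunction(FnName, VF, FromSVML, IsFast);
    if (!VFnName.empty() && FromSVML) {
      // An SVML mapping exists; the returned name already carries the accuracy
      // suffix chosen from the fast-math flags (e.g. "__svml_sin4_ha" for a
      // non-fast call, "__svml_sin4" for a fast one).
    }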
bool isFunctionVectorizable(StringRef F, unsigned VF) const { - return !getVectorizedFunction(F, VF).empty(); + bool Ignored; + return !getVectorizedFunction(F, VF, Ignored, false).empty(); } /// Return true if the function F has a vector equivalent with any @@ -159,7 +166,8 @@ public: /// Return the name of the equivalent of F, vectorized with factor VF. If no /// such mapping exists, return the empty string. - StringRef getVectorizedFunction(StringRef F, unsigned VF) const; + std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML, + bool IsFast) const; /// Return true if the function F has a scalar equivalent, and set VF to be /// the vectorization factor. @@ -253,8 +261,9 @@ public: bool isFunctionVectorizable(StringRef F) const { return Impl->isFunctionVectorizable(F); } - StringRef getVectorizedFunction(StringRef F, unsigned VF) const { - return Impl->getVectorizedFunction(F, VF); + std::string getVectorizedFunction(StringRef F, unsigned VF, bool &FromSVML, + bool IsFast) const { + return Impl->getVectorizedFunction(F, VF, FromSVML, IsFast); } /// Tests if the function is both available and a candidate for optimized code diff --git a/include/llvm/IR/CMakeLists.txt b/include/llvm/IR/CMakeLists.txt index 830f375..dfe25b6 100644 --- a/include/llvm/IR/CMakeLists.txt +++ b/include/llvm/IR/CMakeLists.txt @@ -5,3 +5,7 @@ set(LLVM_TARGET_DEFINITIONS Intrinsics.td) tablegen(LLVM IntrinsicEnums.inc -gen-intrinsic-enums) tablegen(LLVM IntrinsicImpl.inc -gen-intrinsic-impl) add_public_tablegen_target(intrinsics_gen) + +set(LLVM_TARGET_DEFINITIONS SVML.td) +tablegen(LLVM SVML.inc -gen-svml) +add_public_tablegen_target(svml_gen) diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 49c3be9..24bc535 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -223,6 +223,9 @@ namespace CallingConv { // Calling convention between AArch64 Advanced SIMD functions AArch64_VectorCall = 97, + /// Intel_SVML - Calling conventions for Intel Short Math Vector Library + Intel_SVML = 98, + /// The highest possible calling convention ID. Must be some 2^k - 1. MaxID = 1023 }; diff --git a/include/llvm/IR/SVML.td b/include/llvm/IR/SVML.td new file mode 100644 index 0000000..5af7104 --- /dev/null +++ b/include/llvm/IR/SVML.td @@ -0,0 +1,62 @@ +//===-- Intel_SVML.td - Defines SVML call variants ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is used by TableGen to define the different typs of SVML function +// variants used with -fveclib=SVML. 
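As a rough illustration of what the new -gen-svml TableGen backend (added in utils/TableGen/SVMLEmitter.cpp below) is expected to produce for a single record such as sin: the generated SVML.inc entries have the same triple shape as the hand-written VecDesc table they replace in TargetLibraryInfo.cpp. The struct below is a simplified stand-in used only for this sketch, and the finite-math entries are inferred from the emitter code, so treat the exact set as illustrative:

    #include <cstdio>

    // Simplified stand-in for the VecDesc triples consumed by
    // TargetLibraryInfo: scalar name, SVML entry point, vectorization factor.
    struct VecDescLike { const char *Scalar, *Vector; unsigned VF; };

    // Double-precision entries expected for the "sin" record; single precision
    // adds "sinf", "llvm.sin.f32" and "__sinf_finite" at VF 4/8/16.
    static const VecDescLike SinEntries[] = {
        {"sin", "__svml_sin2", 2},          {"sin", "__svml_sin4", 4},
        {"sin", "__svml_sin8", 8},
        {"llvm.sin.f64", "__svml_sin2", 2}, {"llvm.sin.f64", "__svml_sin4", 4},
        {"llvm.sin.f64", "__svml_sin8", 8},
        {"__sin_finite", "__svml_sin2", 2}, {"__sin_finite", "__svml_sin4", 4},
        {"__sin_finite", "__svml_sin8", 8},
    };

    int main() {
      for (const auto &E : SinEntries)
        std::printf("%s -> %s (VF=%u)\n", E.Scalar, E.Vector, E.VF);
      return 0;
    }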
+// +//===----------------------------------------------------------------------===// + +class SvmlVariant; + +def sin : SvmlVariant; +def cos : SvmlVariant; +def pow : SvmlVariant; +def exp : SvmlVariant; +def log : SvmlVariant; +def acos : SvmlVariant; +def acosh : SvmlVariant; +def asin : SvmlVariant; +def asinh : SvmlVariant; +def atan2 : SvmlVariant; +def atan : SvmlVariant; +def atanh : SvmlVariant; +def cbrt : SvmlVariant; +def cdfnorm : SvmlVariant; +def cdfnorminv : SvmlVariant; +def cosd : SvmlVariant; +def cosh : SvmlVariant; +def erf : SvmlVariant; +def erfc : SvmlVariant; +def erfcinv : SvmlVariant; +def erfinv : SvmlVariant; +def exp10 : SvmlVariant; +def exp2 : SvmlVariant; +def expm1 : SvmlVariant; +def hypot : SvmlVariant; +def invsqrt : SvmlVariant; +def log10 : SvmlVariant; +def log1p : SvmlVariant; +def log2 : SvmlVariant; +def sind : SvmlVariant; +def sinh : SvmlVariant; +def sqrt : SvmlVariant; +def tan : SvmlVariant; +def tanh : SvmlVariant; + +// TODO: SVML does not currently provide _ha and _ep variants of these fucnctions. +// We should call the default variant of these functions in all cases instead. + +// def nearbyint : SvmlVariant; +// def logb : SvmlVariant; +// def floor : SvmlVariant; +// def fmod : SvmlVariant; +// def ceil : SvmlVariant; +// def trunc : SvmlVariant; +// def rint : SvmlVariant; +// def round : SvmlVariant; diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt index c57d8ef..e2e91f5 100644 --- a/lib/Analysis/CMakeLists.txt +++ b/lib/Analysis/CMakeLists.txt @@ -100,4 +100,5 @@ add_llvm_library(LLVMAnalysis DEPENDS intrinsics_gen + svml_gen ) diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp index ae86ee3..62f8284 100644 --- a/lib/Analysis/TargetLibraryInfo.cpp +++ b/lib/Analysis/TargetLibraryInfo.cpp @@ -50,6 +50,11 @@ static bool hasSinCosPiStret(const Triple &T) { return true; } +std::string svmlMangle(StringRef FnName, const bool IsFast) { + std::string FullName = FnName; + return IsFast ? FullName : FullName + "_ha"; +} + /// Initialize the set of available library functions based on the specified /// target triple. This should be carefully written so that a missing target /// triple gets a sane set of defaults. 
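The accuracy-suffix selection introduced by svmlMangle above is small enough to exercise on its own. A standalone sketch, with std::string standing in for StringRef, showing that only fast-math calls use the default SVML entry point while all others are steered to the high-accuracy "_ha" variant (the "_ep" variants named by the SVMLAccuracy enum are not selected yet):

    #include <cassert>
    #include <string>

    // Mirrors svmlMangle(): append "_ha" unless the call is fast-math.
    static std::string svmlMangleSketch(const std::string &FnName, bool IsFast) {
      return IsFast ? FnName : FnName + "_ha";
    }

    int main() {
      assert(svmlMangleSketch("__svml_sin4", /*IsFast=*/true) == "__svml_sin4");
      assert(svmlMangleSketch("__svml_sin4", /*IsFast=*/false) == "__svml_sin4_ha");
      return 0;
    }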
@@ -1492,109 +1497,9 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( } case SVML: { const VecDesc VecFuncs[] = { - {"sin", "__svml_sin2", 2}, - {"sin", "__svml_sin4", 4}, - {"sin", "__svml_sin8", 8}, - - {"sinf", "__svml_sinf4", 4}, - {"sinf", "__svml_sinf8", 8}, - {"sinf", "__svml_sinf16", 16}, - - {"llvm.sin.f64", "__svml_sin2", 2}, - {"llvm.sin.f64", "__svml_sin4", 4}, - {"llvm.sin.f64", "__svml_sin8", 8}, - - {"llvm.sin.f32", "__svml_sinf4", 4}, - {"llvm.sin.f32", "__svml_sinf8", 8}, - {"llvm.sin.f32", "__svml_sinf16", 16}, - - {"cos", "__svml_cos2", 2}, - {"cos", "__svml_cos4", 4}, - {"cos", "__svml_cos8", 8}, - - {"cosf", "__svml_cosf4", 4}, - {"cosf", "__svml_cosf8", 8}, - {"cosf", "__svml_cosf16", 16}, - - {"llvm.cos.f64", "__svml_cos2", 2}, - {"llvm.cos.f64", "__svml_cos4", 4}, - {"llvm.cos.f64", "__svml_cos8", 8}, - - {"llvm.cos.f32", "__svml_cosf4", 4}, - {"llvm.cos.f32", "__svml_cosf8", 8}, - {"llvm.cos.f32", "__svml_cosf16", 16}, - - {"pow", "__svml_pow2", 2}, - {"pow", "__svml_pow4", 4}, - {"pow", "__svml_pow8", 8}, - - {"powf", "__svml_powf4", 4}, - {"powf", "__svml_powf8", 8}, - {"powf", "__svml_powf16", 16}, - - { "__pow_finite", "__svml_pow2", 2 }, - { "__pow_finite", "__svml_pow4", 4 }, - { "__pow_finite", "__svml_pow8", 8 }, - - { "__powf_finite", "__svml_powf4", 4 }, - { "__powf_finite", "__svml_powf8", 8 }, - { "__powf_finite", "__svml_powf16", 16 }, - - {"llvm.pow.f64", "__svml_pow2", 2}, - {"llvm.pow.f64", "__svml_pow4", 4}, - {"llvm.pow.f64", "__svml_pow8", 8}, - - {"llvm.pow.f32", "__svml_powf4", 4}, - {"llvm.pow.f32", "__svml_powf8", 8}, - {"llvm.pow.f32", "__svml_powf16", 16}, - - {"exp", "__svml_exp2", 2}, - {"exp", "__svml_exp4", 4}, - {"exp", "__svml_exp8", 8}, - - {"expf", "__svml_expf4", 4}, - {"expf", "__svml_expf8", 8}, - {"expf", "__svml_expf16", 16}, - - { "__exp_finite", "__svml_exp2", 2 }, - { "__exp_finite", "__svml_exp4", 4 }, - { "__exp_finite", "__svml_exp8", 8 }, - - { "__expf_finite", "__svml_expf4", 4 }, - { "__expf_finite", "__svml_expf8", 8 }, - { "__expf_finite", "__svml_expf16", 16 }, - - {"llvm.exp.f64", "__svml_exp2", 2}, - {"llvm.exp.f64", "__svml_exp4", 4}, - {"llvm.exp.f64", "__svml_exp8", 8}, - - {"llvm.exp.f32", "__svml_expf4", 4}, - {"llvm.exp.f32", "__svml_expf8", 8}, - {"llvm.exp.f32", "__svml_expf16", 16}, - - {"log", "__svml_log2", 2}, - {"log", "__svml_log4", 4}, - {"log", "__svml_log8", 8}, - - {"logf", "__svml_logf4", 4}, - {"logf", "__svml_logf8", 8}, - {"logf", "__svml_logf16", 16}, - - { "__log_finite", "__svml_log2", 2 }, - { "__log_finite", "__svml_log4", 4 }, - { "__log_finite", "__svml_log8", 8 }, - - { "__logf_finite", "__svml_logf4", 4 }, - { "__logf_finite", "__svml_logf8", 8 }, - { "__logf_finite", "__svml_logf16", 16 }, - - {"llvm.log.f64", "__svml_log2", 2}, - {"llvm.log.f64", "__svml_log4", 4}, - {"llvm.log.f64", "__svml_log8", 8}, - - {"llvm.log.f32", "__svml_logf4", 4}, - {"llvm.log.f32", "__svml_logf8", 8}, - {"llvm.log.f32", "__svml_logf16", 16}, +#define GET_SVML_VARIANTS +#include "llvm/IR/SVML.inc" +#undef GET_SVML_VARIANTS }; addVectorizableFunctions(VecFuncs); break; @@ -1615,19 +1520,26 @@ bool TargetLibraryInfoImpl::isFunctionVectorizable(StringRef funcName) const { return I != VectorDescs.end() && StringRef(I->ScalarFnName) == funcName; } -StringRef TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, - unsigned VF) const { +std::string TargetLibraryInfoImpl::getVectorizedFunction(StringRef F, + unsigned VF, + bool &FromSVML, + bool IsFast) const { + FromSVML = ClVectorLibrary 
== SVML; F = sanitizeFunctionName(F); if (F.empty()) return F; std::vector::const_iterator I = std::lower_bound( VectorDescs.begin(), VectorDescs.end(), F, compareWithScalarFnName); while (I != VectorDescs.end() && StringRef(I->ScalarFnName) == F) { - if (I->VectorizationFactor == VF) + if (I->VectorizationFactor == VF) { + if (FromSVML) { + return svmlMangle(I->VectorFnName, IsFast); + } return I->VectorFnName; + } ++I; } - return StringRef(); + return std::string(); } StringRef TargetLibraryInfoImpl::getScalarizedFunction(StringRef F, diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index eab7ec8..370a9a4 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -601,6 +601,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(spir_kernel); KEYWORD(spir_func); KEYWORD(intel_ocl_bicc); + KEYWORD(intel_svmlcc); KEYWORD(x86_64_sysvcc); KEYWORD(win64cc); KEYWORD(x86_regcallcc); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index ee63450..0d40501 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1866,6 +1866,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'ccc' /// ::= 'fastcc' /// ::= 'intel_ocl_bicc' +/// ::= 'intel_svmlcc' /// ::= 'coldcc' /// ::= 'x86_stdcallcc' /// ::= 'x86_fastcallcc' @@ -1927,6 +1928,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) { case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; + case lltok::kw_intel_svmlcc: CC = CallingConv::Intel_SVML; break; case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break; case lltok::kw_win64cc: CC = CallingConv::Win64; break; case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break; diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index c2e2795..989fc49 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -131,6 +131,7 @@ enum Kind { kw_fastcc, kw_coldcc, kw_intel_ocl_bicc, + kw_intel_svmlcc, kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index a5dc623..63343c2 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -359,6 +359,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::X86_RegCall: Out << "x86_regcallcc"; break; case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break; case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; + case CallingConv::Intel_SVML: Out << "intel_svmlcc"; break; case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break; diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 30e77b9..50c656d 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -2162,6 +2162,7 @@ void Verifier::visitFunction(const Function &F) { case CallingConv::Fast: case CallingConv::Cold: case CallingConv::Intel_OCL_BI: + case CallingConv::Intel_SVML: case CallingConv::PTX_Kernel: case CallingConv::PTX_Device: Assert(!F.isVarArg(), "Calling convention does not support varargs or " diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index fe49c9f..af02be6 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -476,12 +476,29 @@ def RetCC_X86_64 : CallingConv<[ CCDelegateTo ]>; +// Intel_SVML return-value 
convention. +def RetCC_Intel_SVML : CallingConv<[ + // Vector types are returned in XMM0,XMM1 + CCIfType<[v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1]>>, + + // 256-bit FP vectors + CCIfType<[v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1]>>, + + // 512-bit FP vectors + CCIfType<[v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1]>> +]>; + // This is the return-value convention used for the entire X86 backend. def RetCC_X86 : CallingConv<[ // Check if this is the Intel OpenCL built-ins calling convention CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, + CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, + CCIfSubtarget<"is64Bit()", CCDelegateTo>, CCDelegateTo ]>; @@ -985,6 +1002,22 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo ]>; +// X86-64 Intel Short Vector Math Library calling convention. +def CC_Intel_SVML : CallingConv<[ + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2]>>, + + // The 512-bit vector arguments are passed in ZMM registers. + CCIfType<[v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>> +]>; + def CC_X86_32_Intr : CallingConv<[ CCAssignToStack<4, 4> ]>; @@ -1041,6 +1074,7 @@ def CC_X86_64 : CallingConv<[ // This is the argument convention used for the entire X86 backend. def CC_X86 : CallingConv<[ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo>, + CCIfCC<"CallingConv::Intel_SVML", CCDelegateTo>, CCIfSubtarget<"is64Bit()", CCDelegateTo>, CCDelegateTo ]>; @@ -1149,4 +1183,27 @@ def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, (sequence "R%u", 12, 15))>; def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, (sequence "XMM%u", 8, 15))>; - + +// SVML calling convention +def CSR_32_Intel_SVML : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE)>; +def CSR_32_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_32_Intel_SVML, + K4, K5, K6, K7)>; + +def CSR_64_Intel_SVML_NoSSE : CalleeSavedRegs<(add RBX, RSI, RDI, RBP, RSP, R12, R13, R14, R15)>; + +def CSR_64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "XMM%u", 8, 15))>; +def CSR_Win64_Intel_SVML : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "XMM%u", 6, 15))>; + +def CSR_64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "YMM%u", 8, 15))>; +def CSR_Win64_Intel_SVML_AVX : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "YMM%u", 6, 15))>; + +def CSR_64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; +def CSR_Win64_Intel_SVML_AVX512 : CalleeSavedRegs<(add CSR_64_Intel_SVML_NoSSE, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2dfee3a..6b7e94c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3428,7 +3428,8 @@ SDValue X86TargetLowering::LowerFormalArguments( // FIXME: Only some x86_32 calling conventions support AVX512. 
if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || - CallConv == CallingConv::Intel_OCL_BI))) + CallConv == CallingConv::Intel_OCL_BI || + CallConv == CallingConv::Intel_SVML))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index bc39cee..faf2efd 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -311,6 +311,23 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_Intel_OCL_BI_SaveList; break; } + case CallingConv::Intel_SVML: { + if (Is64Bit) { + if (HasAVX512) + return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_SaveList : + CSR_64_Intel_SVML_AVX512_SaveList; + if (HasAVX) + return IsWin64 ? CSR_Win64_Intel_SVML_AVX_SaveList : + CSR_64_Intel_SVML_AVX_SaveList; + + return IsWin64 ? CSR_Win64_Intel_SVML_SaveList : + CSR_64_Intel_SVML_SaveList; + } else { // Is32Bit + if (HasAVX512) + return CSR_32_Intel_SVML_AVX512_SaveList; + return CSR_32_Intel_SVML_SaveList; + } + } case CallingConv::HHVM: return CSR_64_HHVM_SaveList; case CallingConv::X86_RegCall: @@ -425,6 +442,23 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_Intel_OCL_BI_RegMask; break; } + case CallingConv::Intel_SVML: { + if (Is64Bit) { + if (HasAVX512) + return IsWin64 ? CSR_Win64_Intel_SVML_AVX512_RegMask : + CSR_64_Intel_SVML_AVX512_RegMask; + if (HasAVX) + return IsWin64 ? CSR_Win64_Intel_SVML_AVX_RegMask : + CSR_64_Intel_SVML_AVX_RegMask; + + return IsWin64 ? CSR_Win64_Intel_SVML_RegMask : + CSR_64_Intel_SVML_RegMask; + } else { // Is32Bit + if (HasAVX512) + return CSR_32_Intel_SVML_AVX512_RegMask; + return CSR_32_Intel_SVML_RegMask; + } + } case CallingConv::HHVM: return CSR_64_HHVM_RegMask; case CallingConv::X86_RegCall: diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index b1103f8..69566f0 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -792,6 +792,7 @@ public: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: case CallingConv::Intel_OCL_BI: + case CallingConv::Intel_SVML: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. case CallingConv::Win64: diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c45dee5..4521fc9 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -605,6 +605,27 @@ protected: /// vector of instructions. void addMetadata(ArrayRef To, Instruction *From); + /// Check legality of given SVML call instruction \p VecCall generated for + /// scalar call \p Call. If illegal then the appropriate legal instruction + /// is returned. + Value *legalizeSVMLCall(CallInst *VecCall, CallInst *Call); + + /// Returns the legal VF for a call instruction \p CI using TTI information + /// and vector type. + unsigned getLegalVFForCall(CallInst *CI); + + /// Partially vectorize a given call \p Call by breaking it down into multiple + /// calls of \p LegalCall, decided by the variant VF \p LegalVF. + Value *partialVectorizeCall(CallInst *Call, CallInst *LegalCall, + unsigned LegalVF); + + /// Generate shufflevector instruction for a vector value \p V based on the + /// current \p Part and a smaller VF \p LegalVF. + Value *generateShuffleValue(Value *V, unsigned LegalVF, unsigned Part); + + /// Combine partially vectorized calls stored in \p CallResults. 
+ Value *combinePartialVecCalls(SmallVectorImpl &CallResults); + /// The original loop. Loop *OrigLoop; @@ -4106,6 +4127,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } Function *VectorF; + bool FromSVML = false; if (UseVectorIntrinsic) { // Use vector version of the intrinsic. Type *TysForDecl[] = {CI->getType()}; @@ -4114,7 +4136,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); } else { // Use vector version of the library call. - StringRef VFnName = TLI->getVectorizedFunction(FnName, VF); + bool IsFast = CI->getFastMathFlags().isFast(); + std::string VFnName = TLI->getVectorizedFunction(FnName, VF, FromSVML, IsFast); assert(!VFnName.empty() && "Vector function name is empty."); VectorF = M->getFunction(VFnName); if (!VectorF) { @@ -4133,9 +4156,21 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { if (isa(V)) V->copyFastMathFlags(CI); - - VectorLoopValueMap.setVectorValue(&I, Part, V); - addMetadata(V, &I); + // Perform legalization of SVML call instruction only if original call + // was not Intrinsic + if (FromSVML) { + assert((V->getCalledFunction()->getName()).startswith("__svml")); + LLVM_DEBUG(dbgs() << "LV(SVML): Vector call inst:"; V->dump()); + V->setCallingConv(CallingConv::Intel_SVML); + auto *LegalV = cast(legalizeSVMLCall(V, CI)); + LLVM_DEBUG(dbgs() << "LV: Completed SVML legalization.\n LegalV: "; + LegalV->dump()); + VectorLoopValueMap.setVectorValue(&I, Part, LegalV); + addMetadata(LegalV, &I); + } else { + VectorLoopValueMap.setVectorValue(&I, Part, V); + addMetadata(V, &I); + } } break; @@ -4168,6 +4203,242 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } +//===----------------------------------------------------------------------===// +// Implementation of functions for SVML vector call legalization. +//===----------------------------------------------------------------------===// +// +// Unlike other VECLIBs, SVML needs to be used with target-legal +// vector types. Otherwise, link failures and/or runtime failures +// will occur. A motivating example could be - +// +// double *a; +// float *b; +// #pragma clang loop vectorize_width(8) +// for(i = 0; i < N; ++i) { +// a[i] = sin(i); // Legal SVML VF must be 4 or below on AVX +// b[i] = cosf(i); // VF can be 8 on AVX since 8 floats can fit in YMM +// } +// +// Current implementation of vector code generation in LV is +// driven based on a single VF (in InnerLoopVectorizer::VF). This +// inhibits the flexibility of adjusting/choosing different VF +// for different instructions. +// +// Due to this limitation it is much more straightforward to +// first generate the illegal sin8 (svml_sin8 for SVML vector +// library) call and then legalize it than trying to avoid +// generating illegal code from the beginning. +// +// A solution for this problem is to check legality of the +// call instruction right after generating it in vectorizer and +// if it is illegal we split the call arguments and issue multiple +// calls to match the legal VF. This is demonstrated currently for +// the SVML vector library calls (non-intrinsic version only). +// +// Future directions and extensions: +// 1) This legalization example shows us that a good direction +// for the VPlan framework would be to model the vector call +// instructions in a way that legal VF for each call is chosen +// correctly within vectorizer and illegal code generation is +// avoided. 
+// 2) This logic can also be extended to general vector functions +// i.e. legalization OpenMP decalre simd functions. The +// requirements needed for this will be documented soon. + +Value *InnerLoopVectorizer::legalizeSVMLCall(CallInst *VecCall, + CallInst *Call) { + unsigned LegalVF = getLegalVFForCall(VecCall); + + assert(LegalVF > 1 && + "Legal VF for SVML call must be greater than 1 to vectorize"); + + if (LegalVF == VF) + return VecCall; + else if (LegalVF > VF) + // TODO: handle case when we are underfilling vectors + return VecCall; + + // Legal VF for this SVML call is smaller than chosen VF, break it down into + // smaller call instructions + + // Convert args, types and return type to match legal VF + SmallVector NewTys; + SmallVector NewArgs; + Type *NewRetTy = ToVectorTy(Call->getType(), LegalVF); + + for (Value *ArgOperand : Call->arg_operands()) { + Type *Ty = ToVectorTy(ArgOperand->getType(), LegalVF); + NewTys.push_back(Ty); + NewArgs.push_back(UndefValue::get(Ty)); + } + + // Construct legal vector function + Function *F = Call->getCalledFunction(); + StringRef FnName = F->getName(); + Module *M = Call->getModule(); + bool unused = false; + std::string LegalVFnName = TLI->getVectorizedFunction(FnName, LegalVF, unused, Call->getFastMathFlags().isFast()); + LLVM_DEBUG(dbgs() << "LV(SVML): LegalVFnName: " << LegalVFnName << " FnName: " << FnName << "\n"); + assert(!LegalVFnName.empty() && (LegalVFnName != FnName) && + "Could not find legal vector function in TLI."); + + Function *LegalVectorF = M->getFunction(LegalVFnName); + if (!LegalVectorF) { + FunctionType *LegalFTy = FunctionType::get(NewRetTy, NewTys, false); + LegalVectorF = Function::Create(LegalFTy, Function::ExternalLinkage, LegalVFnName, M); + LegalVectorF->copyAttributesFrom(F); + } + assert(LegalVectorF && "Can't create legal SVML vector function."); + + LLVM_DEBUG(dbgs() << "LV(SVML): LegalVectorF: "; LegalVectorF->dump()); + + SmallVector OpBundles; + Call->getOperandBundlesAsDefs(OpBundles); + CallInst *LegalV = CallInst::Create(LegalVectorF, NewArgs, OpBundles); + + if (isa(LegalV)) + LegalV->copyFastMathFlags(Call); + + // Set SVML calling conventions + LegalV->setCallingConv(CallingConv::Intel_SVML); + + LLVM_DEBUG(dbgs() << "LV(SVML): LegalV: "; LegalV->dump()); + + Value *LegalizedCall = partialVectorizeCall(VecCall, LegalV, LegalVF); + + LLVM_DEBUG(dbgs() << "LV(SVML): LegalizedCall: "; LegalizedCall->dump()); + + // Remove the illegal call from Builder + VecCall->eraseFromParent(); + + if (LegalV) + delete LegalV; + + return LegalizedCall; +} + +unsigned InnerLoopVectorizer::getLegalVFForCall(CallInst *CI) { + const DataLayout DL = CI->getModule()->getDataLayout(); + FunctionType *CallFT = CI->getFunctionType(); + // All functions that need legalization should have a vector return type. + // This is true for all SVML functions that are currently supported. 
+ assert(isa(CallFT->getReturnType()) && + "Return type of call that needs legalization is not a vector."); + auto *VecCallRetType = cast(CallFT->getReturnType()); + Type *ElemType = VecCallRetType->getElementType(); + + unsigned TypeBitWidth = DL.getTypeSizeInBits(ElemType); + unsigned VectorBitWidth = TTI->getRegisterBitWidth(true); + unsigned LegalVF = VectorBitWidth / TypeBitWidth; + + LLVM_DEBUG(dbgs() << "LV(SVML): Type Bit Width: " << TypeBitWidth << "\n"); + LLVM_DEBUG(dbgs() << "LV(SVML): Current VL: " << VF << "\n"); + LLVM_DEBUG(dbgs() << "LV(SVML): Vector Bit Width: " << VectorBitWidth + << "\n"); + LLVM_DEBUG(dbgs() << "LV(SVML): Legal Target VL: " << LegalVF << "\n"); + + return LegalVF; +} + +// Partial vectorization of a call instruction is achieved by making clones of +// \p LegalCall and overwriting its argument operands with shufflevector +// equivalent decided based on \p LegalVF and current Part being filled. +Value *InnerLoopVectorizer::partialVectorizeCall(CallInst *Call, + CallInst *LegalCall, + unsigned LegalVF) { + unsigned NumParts = VF / LegalVF; + LLVM_DEBUG(dbgs() << "LV(SVML): NumParts: " << NumParts << "\n"); + SmallVector CallResults; + + for (unsigned Part = 0; Part < NumParts; ++Part) { + auto *ClonedCall = cast(LegalCall->clone()); + + // Update the arg operand of cloned call to shufflevector + for (unsigned i = 0, ie = Call->getNumArgOperands(); i != ie; ++i) { + auto *NewOp = generateShuffleValue(Call->getArgOperand(i), LegalVF, Part); + ClonedCall->setArgOperand(i, NewOp); + } + + LLVM_DEBUG(dbgs() << "LV(SVML): ClonedCall: "; ClonedCall->dump()); + + auto *PartialVecCall = Builder.Insert(ClonedCall); + CallResults.push_back(PartialVecCall); + } + + return combinePartialVecCalls(CallResults); +} + +Value *InnerLoopVectorizer::generateShuffleValue(Value *V, unsigned LegalVF, + unsigned Part) { + // Example: + // Consider the following vector code - + // %1 = sitofp <4 x i32> %0 to <4 x double> + // %2 = call <4 x double> @__svml_sin4(<4 x double> %1) + // + // If the LegalVF is 2, we partially vectorize the sin4 call by invoking + // generateShuffleValue on the operand %1 + // If Part = 1, output value is - + // %shuffle = shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> + // and if Part = 2, output is - + // %shuffle7 =shufflevector <4 x double> %1, <4 x double> undef, <2 x i32> + + assert(isa(V->getType()) && + "Cannot generate shuffles for non-vector values."); + SmallVector ShuffleMask; + Value *Undef = UndefValue::get(V->getType()); + + unsigned ElemIdx = Part * LegalVF; + + for (unsigned K = 0; K < LegalVF; K++) + ShuffleMask.push_back(ElemIdx + K); + + auto *ShuffleInst = + Builder.CreateShuffleVector(V, Undef, ShuffleMask, "shuffle"); + + return ShuffleInst; +} + +// Results of the calls executed by smaller legal call instructions must be +// combined to match the original VF for later use. This is done by constructing +// shufflevector instructions in a cumulative fashion. 
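To make the splitting and recombining concrete: with 128-bit vectors and 64-bit doubles the legal VF is 2, so an 8-wide result is rebuilt from four 2-wide partial results in two rounds of pairwise concatenation, which is the shufflevector tree that combinePartialVecCalls constructs. A standalone sketch of that pairwise tree using plain std::vector in place of IR vectors (illustration only, not LLVM code):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Concatenate two partial results; each call stands in for one
    // shufflevector that merges a pair of partial call results.
    static std::vector<double> concat(const std::vector<double> &A,
                                      const std::vector<double> &B) {
      std::vector<double> R(A);
      R.insert(R.end(), B.begin(), B.end());
      return R;
    }

    int main() {
      // Four partial results, as if VF = 8 were split into parts of LegalVF = 2.
      std::vector<std::vector<double>> Parts = {{0, 1}, {2, 3}, {4, 5}, {6, 7}};
      while (Parts.size() > 1) {
        std::vector<std::vector<double>> Next;
        for (size_t I = 0; I + 1 < Parts.size(); I += 2)
          Next.push_back(concat(Parts[I], Parts[I + 1]));
        Parts = Next; // widths double each round: 2 -> 4 -> 8
      }
      assert(Parts.front().size() == 8 && "combined back to the original VF");
      for (double D : Parts.front())
        std::printf("%g ", D);
      std::printf("\n");
      return 0;
    }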
+Value *InnerLoopVectorizer::combinePartialVecCalls( + SmallVectorImpl &CallResults) { + assert(isa(CallResults[0]->getType()) && + "Cannot combine calls with non-vector results."); + auto *CallType = cast(CallResults[0]->getType()); + + Value *CombinedShuffle; + unsigned NumElems = CallType->getNumElements() * 2; + unsigned NumRegs = CallResults.size(); + + assert(NumRegs >= 2 && isPowerOf2_32(NumRegs) && + "Number of partial vector calls to combine must be a power of 2 " + "(atleast 2^1)"); + + while (NumRegs > 1) { + for (unsigned I = 0; I < NumRegs; I += 2) { + SmallVector ShuffleMask; + for (unsigned J = 0; J < NumElems; J++) + ShuffleMask.push_back(J); + + CombinedShuffle = Builder.CreateShuffleVector( + CallResults[I], CallResults[I + 1], ShuffleMask, "combined"); + LLVM_DEBUG(dbgs() << "LV(SVML): CombinedShuffle:"; + CombinedShuffle->dump()); + CallResults.push_back(CombinedShuffle); + } + + SmallVector::iterator Start = CallResults.begin(); + SmallVector::iterator End = Start + NumRegs; + CallResults.erase(Start, End); + + NumElems *= 2; + NumRegs /= 2; + } + + return CombinedShuffle; +} + void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll old mode 100755 new mode 100644 diff --git a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll index 5a4bfe5..4da2e48 100644 --- a/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll +++ b/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll @@ -39,7 +39,8 @@ for.end: ; preds = %for.body declare double @__exp_finite(double) #0 ; CHECK-LABEL: @exp_f64 -; CHECK: <4 x double> @__svml_exp4 +; CHECK: <2 x double> @__svml_exp2 +; CHECK: <2 x double> @__svml_exp2 ; CHECK: ret define void @exp_f64(double* nocapture %varray) { entry: @@ -99,7 +100,8 @@ for.end: ; preds = %for.body declare double @__log_finite(double) #0 ; CHECK-LABEL: @log_f64 -; CHECK: <4 x double> @__svml_log4 +; CHECK: <2 x double> @__svml_log2 +; CHECK: <2 x double> @__svml_log2 ; CHECK: ret define void @log_f64(double* nocapture %varray) { entry: @@ -159,7 +161,8 @@ for.end: ; preds = %for.body declare double @__pow_finite(double, double) #0 ; CHECK-LABEL: @pow_f64 -; CHECK: <4 x double> @__svml_pow4 +; CHECK: <2 x double> @__svml_pow2 +; CHECK: <2 x double> @__svml_pow2 ; CHECK: ret define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { entry: diff --git a/test/Transforms/LoopVectorize/X86/svml-calls.ll b/test/Transforms/LoopVectorize/X86/svml-calls.ll index 8ff62f1..4d48d98 100644 --- a/test/Transforms/LoopVectorize/X86/svml-calls.ll +++ b/test/Transforms/LoopVectorize/X86/svml-calls.ll @@ -31,7 +31,7 @@ declare float @llvm.log.f32(float) #0 define void @sin_f64(double* nocapture %varray) { ; CHECK-LABEL: @sin_f64( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -54,7 +54,7 @@ for.end: define void @sin_f32(float* nocapture %varray) { ; CHECK-LABEL: @sin_f32( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void 
; entry: @@ -77,7 +77,7 @@ for.end: define void @sin_f64_intrinsic(double* nocapture %varray) { ; CHECK-LABEL: @sin_f64_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -100,7 +100,7 @@ for.end: define void @sin_f32_intrinsic(float* nocapture %varray) { ; CHECK-LABEL: @sin_f32_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_sinf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -123,7 +123,7 @@ for.end: define void @cos_f64(double* nocapture %varray) { ; CHECK-LABEL: @cos_f64( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -146,7 +146,7 @@ for.end: define void @cos_f32(float* nocapture %varray) { ; CHECK-LABEL: @cos_f32( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -169,7 +169,7 @@ for.end: define void @cos_f64_intrinsic(double* nocapture %varray) { ; CHECK-LABEL: @cos_f64_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -192,7 +192,7 @@ for.end: define void @cos_f32_intrinsic(float* nocapture %varray) { ; CHECK-LABEL: @cos_f32_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_cosf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -215,7 +215,7 @@ for.end: define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { ; CHECK-LABEL: @pow_f64( -; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -240,7 +240,7 @@ for.end: define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { ; CHECK-LABEL: @pow_f64_intrinsic( -; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -265,7 +265,7 @@ for.end: define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32( -; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -290,7 +290,7 @@ for.end: define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { ; CHECK-LABEL: @pow_f32_intrinsic( -; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: [[TMP8:%.*]] = call intel_svmlcc <4 x float> @__svml_powf4_ha(<4 x float> 
[[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) ; CHECK: ret void ; entry: @@ -315,7 +315,7 @@ for.end: define void @exp_f64(double* nocapture %varray) { ; CHECK-LABEL: @exp_f64( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -338,7 +338,7 @@ for.end: define void @exp_f32(float* nocapture %varray) { ; CHECK-LABEL: @exp_f32( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -361,7 +361,7 @@ for.end: define void @exp_f64_intrinsic(double* nocapture %varray) { ; CHECK-LABEL: @exp_f64_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -384,7 +384,7 @@ for.end: define void @exp_f32_intrinsic(float* nocapture %varray) { ; CHECK-LABEL: @exp_f32_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_expf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -407,7 +407,7 @@ for.end: define void @log_f64(double* nocapture %varray) { ; CHECK-LABEL: @log_f64( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -430,7 +430,7 @@ for.end: define void @log_f32(float* nocapture %varray) { ; CHECK-LABEL: @log_f32( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -453,7 +453,7 @@ for.end: define void @log_f64_intrinsic(double* nocapture %varray) { ; CHECK-LABEL: @log_f64_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -476,7 +476,7 @@ for.end: define void @log_f32_intrinsic(float* nocapture %varray) { ; CHECK-LABEL: @log_f32_intrinsic( -; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +; CHECK: [[TMP5:%.*]] = call intel_svmlcc <4 x float> @__svml_logf4_ha(<4 x float> [[TMP4:%.*]]) ; CHECK: ret void ; entry: @@ -497,5 +497,44 @@ for.end: ret void } -attributes #0 = { nounwind readnone } +; CHECK-LABEL: @atan2_finite +; CHECK: intel_svmlcc <4 x double> @__svml_atan24 +; CHECK: intel_svmlcc <4 x double> @__svml_atan24 +; CHECK: ret + +declare double @__atan2_finite(double, double) local_unnamed_addr #0 +define void @atan2_finite([100 x double]* nocapture %varray) local_unnamed_addr #0 { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc7, %entry + %indvars.iv19 = phi i64 [ 0, %entry ], [ %indvars.iv.next20, %for.inc7 ] + %0 = trunc i64 %indvars.iv19 to i32 + %conv = sitofp i32 %0 to double + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.cond1.preheader + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %1 = trunc i64 %indvars.iv.next to i32 + %conv4 = sitofp i32 %1 to 
double + %call = tail call fast double @__atan2_finite(double %conv, double %conv4) + %arrayidx6 = getelementptr inbounds [100 x double], [100 x double]* %varray, i64 %indvars.iv19, i64 %indvars.iv + store double %call, double* %arrayidx6, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.inc7, label %for.body3, !llvm.loop !5 + +for.inc7: ; preds = %for.body3 + %indvars.iv.next20 = add nuw nsw i64 %indvars.iv19, 1 + %exitcond21 = icmp eq i64 %indvars.iv.next20, 100 + br i1 %exitcond21, label %for.end9, label %for.cond1.preheader + +for.end9: ; preds = %for.inc7 + ret void +} + +attributes #0 = { nounwind readnone } +!5 = distinct !{!5, !6, !7} +!6 = !{!"llvm.loop.vectorize.width", i32 8} +!7 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll new file mode 100644 index 0000000..0524c28 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/svml-legal-calls.ll @@ -0,0 +1,513 @@ +; Check legalization of SVML calls, including intrinsic versions (like @llvm..). + +; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare double @sin(double) #0 +declare float @sinf(float) #0 +declare double @llvm.sin.f64(double) #0 +declare float @llvm.sin.f32(float) #0 + +declare double @cos(double) #0 +declare float @cosf(float) #0 +declare double @llvm.cos.f64(double) #0 +declare float @llvm.cos.f32(float) #0 + +declare double @pow(double, double) #0 +declare float @powf(float, float) #0 +declare double @llvm.pow.f64(double, double) #0 +declare float @llvm.pow.f32(float, float) #0 + +declare double @exp(double) #0 +declare float @expf(float) #0 +declare double @llvm.exp.f64(double) #0 +declare float @llvm.exp.f32(float) #0 + +declare double @log(double) #0 +declare float @logf(float) #0 +declare double @llvm.log.f64(double) #0 +declare float @llvm.log.f32(float) #0 + + +define void @sin_f64(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sin(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sinf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + 
+for.end: + ret void +} + +define void @sin_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_sin4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sin.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_sinf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sin.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @cos(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @cosf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_cos4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.cos.f64(double %conv) + %arrayidx = 
getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_cosf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cos.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +; CHECK: [[TMP4:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @pow(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP2:%.*]], <4 x double> [[TMP3:%.*]]) +; CHECK: [[TMP4:%.*]] = call intel_svmlcc <4 x double> @__svml_pow4_ha(<4 x double> [[TMP5:%.*]], <4 x double> [[TMP6:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float 
@powf(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_powf8_ha(<8 x float> [[TMP2:%.*]], <8 x float> [[TMP3:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @exp(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @expf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_exp4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.exp.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32_intrinsic( 
+; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_expf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.exp.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64(double* nocapture %varray) { +; CHECK-LABEL: @log_f64( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32(float* nocapture %varray) { +; CHECK-LABEL: @log_f32( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @logf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @log_f64_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP2:%.*]]) +; CHECK: [[TMP3:%.*]] = call intel_svmlcc <4 x double> @__svml_log4_ha(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @log_f32_intrinsic( +; CHECK: [[TMP1:%.*]] = call intel_svmlcc <8 x float> @__svml_logf8_ha(<8 x float> [[TMP2:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + 
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
new file mode 100644
index 0000000..007eea7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/svml-legal-codegen.ll
@@ -0,0 +1,61 @@
+; Check that vector codegen splits an illegal sin8 call into two sin4 calls on AVX for the double datatype.
+; The C code used to generate this test:
+
+; #include <math.h>
+;
+; void foo(double *a, int N){
+;   int i;
+; #pragma clang loop vectorize_width(8)
+;   for (i=0;i<N;i++){
+;     a[i] = sin(i);
+;   }
+; }
+
+; RUN: opt -vector-library=SVML -mattr=avx -loop-vectorize -S < %s | FileCheck %s
+
+; CHECK: [[I1:%.*]] = sitofp <8 x i32> [[I0:%.*]] to <8 x double>
+; CHECK-NEXT: [[S1:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[I2:%.*]] = call fast intel_svmlcc <4 x double> @__svml_sin4(<4 x double> [[S1]])
+; CHECK-NEXT: [[S2:%shuffle.*]] = shufflevector <8 x double> [[I1]], <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[I3:%.*]] = call fast intel_svmlcc <4 x double> @__svml_sin4(<4 x double> [[S2]])
+; CHECK-NEXT: [[comb:%combined.*]] = shufflevector <4 x double> [[I2]], <4 x double> [[I3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: store <8 x double> [[comb]], <8 x double>* [[TMP:%.*]], align 8
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define dso_local void @foo(double* nocapture %a, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp5 = icmp sgt i32 %N, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = trunc i64 %indvars.iv to i32
+  %conv = sitofp i32 %0 to double
+  %call = tail call fast double @sin(double %conv) #2
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %call, double* %arrayidx, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+declare dso_local double @sin(double) local_unnamed_addr #1
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"double", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index c88365a..7fb5795 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -39,6 +39,7 @@ add_tablegen(llvm-tblgen LLVM
   SearchableTableEmitter.cpp
   SubtargetEmitter.cpp
   SubtargetFeatureInfo.cpp
+  SVMLEmitter.cpp
   TableGen.cpp
   Types.cpp
   X86DisassemblerTables.cpp
diff --git a/utils/TableGen/SVMLEmitter.cpp b/utils/TableGen/SVMLEmitter.cpp
new file mode 100644
index 0000000..8800ca8
--- /dev/null
+++ b/utils/TableGen/SVMLEmitter.cpp
@@ -0,0 +1,110 @@
+//===------ SVMLEmitter.cpp - Generate SVML function variants -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This TableGen backend emits the scalar-to-SVML function mappings used by TLI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "SVMLVariants"
+#include "llvm/Support/Debug.h"
+
+namespace {
+
+class SVMLVariantsEmitter {
+
+  RecordKeeper &Records;
+
+private:
+  void emitSVMLVariants(raw_ostream &OS);
+
+public:
+  SVMLVariantsEmitter(RecordKeeper &R) : Records(R) {}
+
+  void run(raw_ostream &OS);
+};
+} // End anonymous namespace
+
+/// \brief Emit the set of SVML variant function names.
+// The default is to emit the high accuracy SVML variants until a mechanism is
+// introduced to allow a selection of different variants through precision
+// requirements specified by the user. This code generates mappings to svml
+// that are in the scalar form of llvm intrinsics, math library calls, or the
+// finite variants of math library calls.
+void SVMLVariantsEmitter::emitSVMLVariants(raw_ostream &OS) {
+
+  const unsigned MinSinglePrecVL = 4;
+  const unsigned MaxSinglePrecVL = 16;
+  const unsigned MinDoublePrecVL = 2;
+  const unsigned MaxDoublePrecVL = 8;
+
+  OS << "#ifdef GET_SVML_VARIANTS\n";
+
+  for (const auto &D : Records.getAllDerivedDefinitions("SvmlVariant")) {
+    StringRef SvmlVariantNameStr = D->getName();
+    // Single Precision SVML
+    for (unsigned VL = MinSinglePrecVL; VL <= MaxSinglePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "f" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f32" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+
+      // Emit the finite math library function to svml function entry.
+      OS << "{\"__" << SvmlVariantNameStr << "f_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << "f" << VL << "\", "
+         << VL << "},\n";
+    }
+
+    // Double Precision SVML
+    for (unsigned VL = MinDoublePrecVL; VL <= MaxDoublePrecVL; VL *= 2) {
+      // Emit the scalar math library function to svml function entry.
+      OS << "{\"" << SvmlVariantNameStr << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the scalar intrinsic to svml function entry.
+      OS << "{\"" << "llvm." << SvmlVariantNameStr << ".f64" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", " << VL
+         << "},\n";
+
+      // Emit the finite math library function to svml function entry.
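+      // For illustration (assuming an SvmlVariant record named "sin"): at
+      // VL == 2 the lines below emit {"__sin_finite", "__svml_sin2", 2},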
+      OS << "{\"__" << SvmlVariantNameStr << "_finite" << "\", ";
+      OS << "\"" << "__svml_" << SvmlVariantNameStr << VL << "\", "
+         << VL << "},\n";
+    }
+  }
+
+  OS << "#endif // GET_SVML_VARIANTS\n\n";
+}
+
+void SVMLVariantsEmitter::run(raw_ostream &OS) {
+  emitSVMLVariants(OS);
+}
+
+namespace llvm {
+
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS) {
+  SVMLVariantsEmitter(RK).run(OS);
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index d5b6a3c..64d65e5 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -54,6 +54,7 @@ enum ActionType {
   GenX86FoldTables,
   GenRegisterBank,
   GenExegesis,
+  GenSVMLVariants,
 };
 
 namespace {
@@ -120,7 +121,9 @@ namespace {
                     clEnumValN(GenRegisterBank, "gen-register-bank",
                                "Generate registers bank descriptions"),
                     clEnumValN(GenExegesis, "gen-exegesis",
-                               "Generate llvm-exegesis tables")));
+                               "Generate llvm-exegesis tables"),
+                    clEnumValN(GenSVMLVariants, "gen-svml",
+                               "Generate SVML variant function names")));
 
 cl::OptionCategory PrintEnumsCat("Options for -print-enums");
 cl::opt<std::string>
@@ -237,6 +240,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenExegesis:
     EmitExegesis(Records, OS);
     break;
+  case GenSVMLVariants:
+    EmitSVMLVariants(Records, OS);
+    break;
   }
 
   return false;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index f4f2909..544a142 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -90,6 +90,7 @@ void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
 void EmitExegesis(RecordKeeper &RK, raw_ostream &OS);
+void EmitSVMLVariants(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim
index 7ea007f..0abeadf 100644
--- a/utils/vim/syntax/llvm.vim
+++ b/utils/vim/syntax/llvm.vim
@@ -94,6 +94,7 @@ syn keyword llvmKeyword
       \ inreg
       \ inteldialect
      \ intel_ocl_bicc
+      \ intel_svmlcc
       \ internal
       \ linkonce
       \ linkonce_odr
-- 
1.8.3.1
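
For reference, a sketch of the table the new -gen-svml backend emits under the
GET_SVML_VARIANTS guard, shown for a single SvmlVariant record named "sin" and
only the smallest vector lengths (the loops in SVMLEmitter.cpp repeat this
pattern for VL 4/8/16 in single precision and 2/4/8 in double precision; the
actual record list lives in include/llvm/IR/SVML.td):

  #ifdef GET_SVML_VARIANTS
  {"sinf", "__svml_sinf4", 4},
  {"llvm.sin.f32", "__svml_sinf4", 4},
  {"__sinf_finite", "__svml_sinf4", 4},
  {"sin", "__svml_sin2", 2},
  {"llvm.sin.f64", "__svml_sin2", 2},
  {"__sin_finite", "__svml_sin2", 2},
  #endif // GET_SVML_VARIANTS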