diff --git a/VEX/priv/guest_arm64_toIR.c b/VEX/priv/guest_arm64_toIR.c
index e6b92c7a0..9f058db81 100644
--- a/VEX/priv/guest_arm64_toIR.c
+++ b/VEX/priv/guest_arm64_toIR.c
@@ -39,11 +39,6 @@
 
      FMULX is treated the same as FMUL.  That's also not correct.
 
-   * Floating multiply-add (etc) insns.  Are split into a multiply and 
-     an add, and so suffer double rounding and hence sometimes the
-     least significant mantissa bit is incorrect.  Fix: use the IR
-     multiply-add IROps instead.
-
    * FRINTX might be need updating to set the inexact computation FPSR flag
 
    * Ditto FCVTXN.  No idea what "round to odd" means.  This implementation
@@ -546,13 +541,13 @@ static IROp mkFMADDF ( IRType ty ) {
    }
 }
 
-static IROp mkFMSUBF ( IRType ty ) {
-   switch (ty) {
-      case Ity_F32: return Iop_MSubF32;
-      case Ity_F64: return Iop_MSubF64;
-      default: vpanic("mkFMSUBF");
-   }
-}
+/* static IROp mkFMSUBF ( IRType ty ) { */
+/*    switch (ty) { */
+/*       case Ity_F32: return Iop_MSubF32; */
+/*       case Ity_F64: return Iop_MSubF64; */
+/*       default: vpanic("mkFMSUBF"); */
+/*    } */
+/* } */
 
 static IROp mkSUBF ( IRType ty ) {
    switch (ty) {
@@ -2224,6 +2219,120 @@ static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
    return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
 }
 
+/* Helper to unroll FMA*/
+static void unroll2_V128(IRExpr* aExpr,  IRTemp* ai_F64){
+   IRTemp ai_I64[2];
+   for(int i=0; i<2; i++) ai_I64[i]= newTemp(Ity_I64);
+
+   assign(ai_I64[0], unop(Iop_V128to64,   aExpr ));
+   assign(ai_I64[1], unop(Iop_V128HIto64, aExpr ));
+
+   for(int i=0; i<2; i++){
+      ai_F64[i]= newTemp(Ity_F64);
+      assign(ai_F64[i], unop(Iop_ReinterpI64asF64, mkexpr(ai_I64[i]) ));
+   }
+}
+
+static void unroll4_V128(IRExpr* aExpr,  IRTemp* ai_F32){
+   IRTemp a64_LO = newTemp(Ity_I64);
+   IRTemp a64_HI = newTemp(Ity_I64);
+   assign(a64_LO, unop(Iop_V128to64,   aExpr ));
+   assign(a64_HI, unop(Iop_V128HIto64, aExpr ));
+
+   IRTemp ai_I32[4];
+   for(int i=0; i<4 ; i++) ai_I32[i]=newTemp(Ity_I32);
+
+   assign(ai_I32[0], unop(Iop_64to32,   mkexpr(a64_LO) ));
+   assign(ai_I32[1], unop(Iop_64HIto32, mkexpr(a64_LO) ));
+   assign(ai_I32[2], unop(Iop_64to32,   mkexpr(a64_HI) ));
+   assign(ai_I32[3], unop(Iop_64HIto32, mkexpr(a64_HI) ));
+
+   for(int i=0; i<4 ; i++){
+      ai_F32[i]=newTemp(Ity_F32);
+      assign(ai_F32[i], unop(Iop_ReinterpI32asF32, mkexpr(ai_I32[i]) ));
+   }
+}
+
+static IRTemp unroll_vectorized2_fma(Bool isSub, IRExpr* rm, IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+   //a*b+c
+   //rm : rounding mode
+
+   // unroll arguments
+   IRTemp ai_F64[2], bi_F64[2], ci_F64[2];
+   unroll2_V128(aExpr,ai_F64);
+   unroll2_V128(bExpr,bi_F64);
+   unroll2_V128(cExpr,ci_F64);
+
+   // fma loop with type conversion (I32->F32 and F32->I32 )
+   IRTemp ri_F64[2],ri_I64[2];
+   // IROp   opMaddOrSub = isSub ? Iop_MSubF64 : Iop_MAddF64;
+   IROp   opMaddOrSub = isSub ? Iop_MAddF64 : Iop_MAddF64;//Workaround to avoid problem with Msub
+   for(int i=0; i<2; i++){
+      ri_F64[i]= newTemp(Ity_F64);
+      //apply fma or fms
+      if(isSub){//Workarround
+         assign(ri_F64[i], qop( opMaddOrSub, rm,
+                                unop(Iop_NegF64 ,mkexpr(ai_F64[i])), mkexpr(bi_F64[i]), mkexpr(ci_F64[i])));
+      }else{
+         assign(ri_F64[i], qop( opMaddOrSub, rm,
+                                mkexpr(ai_F64[i]), mkexpr(bi_F64[i]), mkexpr(ci_F64[i])));
+      }
+      ri_I64[i]=newTemp(Ity_I64);
+      assign(ri_I64[i], unop(Iop_ReinterpF64asI64, mkexpr(ri_F64[i])));
+   }
+
+   //roll up the results
+   IRTemp res = newTempV128();
+   assign(res, binop(Iop_64HLtoV128, mkexpr(ri_I64[1]), mkexpr(ri_I64[0])));
+   return res;
+}
+
+
+static IRTemp unroll_vectorized4_fma(Bool isSub, IRExpr* rm, IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+   //a*b+c rm : rounding mode
+
+   // unroll arguments
+   IRTemp ai_F32[4],bi_F32[4],ci_F32[4];
+   unroll4_V128(aExpr, ai_F32);
+   unroll4_V128(bExpr, bi_F32);
+   unroll4_V128(cExpr, ci_F32);
+
+   // fma loop with type conversion (I32->F32 and F32->I32 )
+   IRTemp ri_F32[4],ri_I32[4];
+   //IROp   opMaddOrSub = isSub ? Iop_MSubF32 : Iop_MAddF32;
+   IROp   opMaddOrSub = isSub ? Iop_MAddF32 : Iop_MAddF32; //Workaround to avoid problem with Msub
+   for(int i=0; i<4 ; i++){
+      ri_F32[i]=newTemp(Ity_F32);
+
+      if(isSub){
+         assign(ri_F32[i], qop( opMaddOrSub, rm,
+                                unop(Iop_NegF32 ,mkexpr(ai_F32[i])),mkexpr(bi_F32[i]),mkexpr(ci_F32[i])));
+      }else{
+         assign(ri_F32[i], qop( opMaddOrSub, rm,
+			     mkexpr(ai_F32[i]),mkexpr(bi_F32[i]),mkexpr(ci_F32[i])));
+      }
+      ri_I32[i]=newTemp(Ity_I32);
+      assign(ri_I32[i], unop(Iop_ReinterpF32asI32, mkexpr(ri_F32[i])));
+   }
+
+   //roll up the results
+   IRTemp resLo = newTemp(Ity_I64);
+   IRTemp resHi = newTemp(Ity_I64);
+   assign(resLo, binop(Iop_32HLto64, mkexpr(ri_I32[1]), mkexpr(ri_I32[0])));
+   assign(resHi, binop(Iop_32HLto64, mkexpr(ri_I32[3]), mkexpr(ri_I32[2])));
+   IRTemp res    = newTempV128();
+   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
+   return res;
+}
+
+static IRTemp unroll_vectorized_fma(UInt unrollSize, Bool isSub, IRExpr* rm,
+				    IRExpr* aExpr, IRExpr* bExpr, IRExpr* cExpr){
+   vassert(unrollSize == 2 || unrollSize == 4 );
+   if(unrollSize==2){
+      return unroll_vectorized2_fma(isSub, rm, aExpr, bExpr,cExpr);
+   }
+   return unroll_vectorized4_fma(isSub, rm, aExpr, bExpr,cExpr);
+}
 
 /*------------------------------------------------------------*/
 /*--- FP comparison helpers                                ---*/
@@ -11757,19 +11866,17 @@ Bool dis_AdvSIMD_scalar_x_indexed_element(/*MB_OUT*/DisResult* dres, UInt insn)
       UInt   mm    = (bitM << 4) | mmLO4;
       assign(elem, getQRegLane(mm, index, ity));
       IRTemp dupd  = math_DUP_TO_V128(elem, ity);
-      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
-      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
-      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
       IRTemp rm    = mk_get_IR_rounding_mode();
-      IRTemp t1    = newTempV128();
-      IRTemp t2    = newTempV128();
-      // FIXME: double rounding; use FMA primops instead
-      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
-      assign(t2, triop(isSUB ? opSUB : opADD,
-                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
+
+      UInt unrollSize= (isD ? 2 : 4);
+      IRTemp res=unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+				       getQReg128(nn),
+				       mkexpr(dupd),
+				       getQReg128(dd));
+
       putQReg128(dd,
                  mkexpr(math_ZERO_ALL_EXCEPT_LOWEST_LANE(isD ? 3 : 2,
-                                                         mkexpr(t2))));
+                                                         mkexpr(res))));
       const HChar c = isD ? 'd' : 's';
       DIP("%s %c%u, %c%u, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
           c, dd, c, nn, nameQReg128(mm), c, index);
@@ -13214,18 +13321,15 @@ Bool dis_AdvSIMD_three_same(/*MB_OUT*/DisResult* dres, UInt insn)
       Bool isD   = (size & 1) == 1;
       Bool isSUB = (size & 2) == 2;
       if (bitQ == 0 && isD) return False; // implied 1d case
-      IROp opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
-      IROp opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
-      IROp opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
       IRTemp rm = mk_get_IR_rounding_mode();
-      IRTemp t1 = newTempV128();
-      IRTemp t2 = newTempV128();
-      // FIXME: double rounding; use FMA primops instead
-      assign(t1, triop(opMUL,
-                       mkexpr(rm), getQReg128(nn), getQReg128(mm)));
-      assign(t2, triop(isSUB ? opSUB : opADD,
-                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
-      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
+
+      UInt unrollSize= (isD ? 2 : 4);
+      IRTemp res= unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+					getQReg128(nn),
+					getQReg128(mm),
+					getQReg128(dd));
+
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
       DIP("%s %s.%s, %s.%s, %s.%s\n", isSUB ? "fmls" : "fmla",
           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm), arr);
@@ -14438,17 +14542,15 @@ Bool dis_AdvSIMD_vector_x_indexed_elem(/*MB_OUT*/DisResult* dres, UInt insn)
       UInt   mm    = (bitM << 4) | mmLO4;
       assign(elem, getQRegLane(mm, index, ity));
       IRTemp dupd  = math_DUP_TO_V128(elem, ity);
-      IROp   opADD = isD ? Iop_Add64Fx2 : Iop_Add32Fx4;
-      IROp   opSUB = isD ? Iop_Sub64Fx2 : Iop_Sub32Fx4;
-      IROp   opMUL = isD ? Iop_Mul64Fx2 : Iop_Mul32Fx4;
       IRTemp rm    = mk_get_IR_rounding_mode();
-      IRTemp t1    = newTempV128();
-      IRTemp t2    = newTempV128();
-      // FIXME: double rounding; use FMA primops instead
-      assign(t1, triop(opMUL, mkexpr(rm), getQReg128(nn), mkexpr(dupd)));
-      assign(t2, triop(isSUB ? opSUB : opADD,
-                       mkexpr(rm), getQReg128(dd), mkexpr(t1)));
-      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, t2));
+
+      UInt unrollSize= (isD ? 2 : 4);
+      IRTemp res=unroll_vectorized_fma(unrollSize, isSUB, mkexpr(rm),
+				       getQReg128(nn),
+				       mkexpr(dupd),
+				       getQReg128(dd));
+
+      putQReg128(dd, math_MAYBE_ZERO_HI64(bitQ, res));
       const HChar* arr = bitQ == 0 ? "2s" : (isD ? "2d" : "4s");
       DIP("%s %s.%s, %s.%s, %s.%c[%u]\n", isSUB ? "fmls" : "fmla",
           nameQReg128(dd), arr, nameQReg128(nn), arr, nameQReg128(mm),
@@ -15682,7 +15784,7 @@ Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
       UInt    ix    = (bitO1 << 1) | bitO0;
       IRType  ity   = isD ? Ity_F64 : Ity_F32;
       IROp    opFMADD = mkFMADDF(ity);
-      IROp    opFMSUB = mkFMSUBF(ity);
+      //IROp    opFMSUB = mkFMSUBF(ity);
       IROp    opNEG = mkNEGF(ity);
       IRTemp  res   = newTemp(ity);
       IRExpr* eA    = getQRegLO(aa, ity);
@@ -15694,7 +15796,7 @@ Bool dis_AdvSIMD_fp_data_proc_3_source(/*MB_OUT*/DisResult* dres, UInt insn)
             assign(res, qop(opFMADD, rm, eN, eM, eA));
             break;
          case 1: /* FMSUB */
-            assign(res, qop(opFMSUB, rm, eN, eM, eA));
+            assign(res, qop(opFMADD, rm, unop(opNEG,eN), eM, eA));
             break;
          case 2: /* FNMADD */
             assign(res, qop(opFMADD, rm, unop(opNEG, eN), eM,
diff --git a/VEX/priv/host_arm64_isel.c b/VEX/priv/host_arm64_isel.c
index 34c526559..db5ccc8bf 100644
--- a/VEX/priv/host_arm64_isel.c
+++ b/VEX/priv/host_arm64_isel.c
@@ -3452,6 +3452,12 @@ static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
             addInstr(env, ARM64Instr_VUnaryD(ARM64fpu_RINTE, dst, src));
             return dst;
          }
+         case Iop_ReinterpI64asF64: {
+            HReg src   = iselIntExpr_R(env, e->Iex.Unop.arg);
+            HReg dst   = newVRegD(env);
+            addInstr(env, ARM64Instr_VDfromX(dst,src));
+            return dst;
+         }
          default:
             break;
       }
@@ -3652,6 +3658,12 @@ static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
             addInstr(env, ARM64Instr_VUnaryS(ARM64fpu_RINTE, dst, src));
             return dst;
          }
+         case Iop_ReinterpI32asF32: {
+            HReg src   = iselIntExpr_R(env, e->Iex.Unop.arg);
+            HReg dst   = newVRegD(env);
+            addInstr(env, ARM64Instr_VDfromX(dst,src));
+            return dst;
+         }
          default:
             break;
       }
diff --git a/none/tests/arm64/fp_and_simd.stdout.exp b/none/tests/arm64/fp_and_simd.stdout.exp
index aba3ed295..2b2337b04 100644
--- a/none/tests/arm64/fp_and_simd.stdout.exp
+++ b/none/tests/arm64/fp_and_simd.stdout.exp
@@ -26537,7 +26537,7 @@ fminp v2.2s, v23.2s, v11.2s   5d370116702dec2f0853b0775b14704a  c5c7be2ff6281ec1
 fmaxv s2, v23.4s   e50c87c40611df68b742b56dee7dd8be  61cd25cb19673e5118debc4ed1ae7687  00000000000000000000000061cd25cb  61cd25cb19673e5118debc4ed1ae7687 fpsr=00000000
 fminv s2, v23.4s   3ba37b9d5e20798112db006efa0a4cd2  0484896dbedf3b16a16bdeb5a97c1d4c  000000000000000000000000bedf3b16  0484896dbedf3b16a16bdeb5a97c1d4c fpsr=00000000
 fmla v2.2d, v23.2d, v11.2d   a24c16b6fbf1dbe7767b860520768f7a  28fff582518a22a319e3e171a571d9ac  537003f26d34a4719a85bb76992a9f95  3c7ffd64956e2bfc767b860520768f7a  28fff582518a22a319e3e171a571d9ac  537003f26d34a4719a85bb76992a9f95 fpsr=00000000
-fmla v2.4s, v23.4s, v11.4s   4b2e18537a0be303eac51d54243b2f57  6ac81f4aeb27b92437ba5bb916bb38b3  a28eb68cc46da8d47b4bccf5ea603cd7  cdd9af5a7a0be30d73945bbec1a3fe1b  6ac81f4aeb27b92437ba5bb916bb38b3  a28eb68cc46da8d47b4bccf5ea603cd7 fpsr=00000000
+fmla v2.4s, v23.4s, v11.4s   4b2e18537a0be303eac51d54243b2f57  6ac81f4aeb27b92437ba5bb916bb38b3  a28eb68cc46da8d47b4bccf5ea603cd7  cdd9af5b7a0be30d73945bbec1a3fe1b  6ac81f4aeb27b92437ba5bb916bb38b3  a28eb68cc46da8d47b4bccf5ea603cd7 fpsr=00000000
 fmla v2.2s, v23.2s, v11.2s   bd3a1d66fb90508d9b554cc88bd9417c  64c60fab54e44979227656a5f72b2f2d  3cf1f48789bda456e4e968a851c12790  0000000000000000c7e09991ff800000  64c60fab54e44979227656a5f72b2f2d  3cf1f48789bda456e4e968a851c12790 fpsr=00000000
 fmls v2.2d, v23.2d, v11.2d   8d01c3adf5066458cc3f2583a0d0c181  6da307c3b831f3561c1f07e7838ee807  c934b1a91030fb0fd6ad847a846bf852  76e89d03a0a967eecc3f2583a0d0c181  6da307c3b831f3561c1f07e7838ee807  c934b1a91030fb0fd6ad847a846bf852 fpsr=00000000
 fmls v2.4s, v23.4s, v11.4s   1a24700fe0e892172f4cd975a0cd533c  8febd45002ac2b1d9a7e85640e989632  e67a3a51cd8d0bc03515c85211aaefae  b6e6830fe0e892172f4cd975a0cd533c  8febd45002ac2b1d9a7e85640e989632  e67a3a51cd8d0bc03515c85211aaefae fpsr=00000000
@@ -26554,7 +26554,7 @@ fmla v2.2d, v11.2d, v29.d[0]   618ac9772a2770959500ac0e16c1e17f  c4464ff780bf931
 fmla v2.2d, v11.2d, v29.d[1]   2013ede0878f12d99a437060904c189e  9b6eb24aa9cf28f51a19be44a1098c3a  9305f1b5675893e794bbcad76929d605  2013ede0878f12d99a437060904c189e  9b6eb24aa9cf28f51a19be44a1098c3a  9305f1b5675893e794bbcad76929d605 fpsr=00000000
 fmla v2.4s, v11.4s, v29.s[0]   ca9647a19b4fca4f05513f650acfc98c  48ce928e2795f701340652530ab07605  639299ffca0408200680c4682e35a72e  ca9647a19b4f951a22be9fd90acfc98c  48ce928e2795f701340652530ab07605  639299ffca0408200680c4682e35a72e fpsr=00000000
 fmla v2.4s, v11.4s, v29.s[3]   d0f4ee2ee71de562dc700e0df7d8a732  95bfbe48e522971eef5ebfce191b1b56  8ce26234dbd360878142d79e1f367610  d0f4ee2ee71de562dc700e0df7d8a732  95bfbe48e522971eef5ebfce191b1b56  8ce26234dbd360878142d79e1f367610 fpsr=00000000
-fmla v2.2s, v11.2s, v29.s[0]   1f35b9149b2ad61ba7ab915bef14eb2c  b0c6183186fa1016b81832c8103c2999  4e7bc70b85821deba4c837b0b1c88948  00000000000000002a691609ef14eb2c  b0c6183186fa1016b81832c8103c2999  4e7bc70b85821deba4c837b0b1c88948 fpsr=00000000
+fmla v2.2s, v11.2s, v29.s[0]   1f35b9149b2ad61ba7ab915bef14eb2c  b0c6183186fa1016b81832c8103c2999  4e7bc70b85821deba4c837b0b1c88948  00000000000000002a69160aef14eb2c  b0c6183186fa1016b81832c8103c2999  4e7bc70b85821deba4c837b0b1c88948 fpsr=00000000
 fmla v2.2s, v11.2s, v29.s[3]   ddf389ddc7dd949e723328dbefc60b40  4051ef84669665d8865b6ab40a8f385a  1bb053a3944a2bf1bfdf2b18655c7492  0000000000000000723328dbefc60b40  4051ef84669665d8865b6ab40a8f385a  1bb053a3944a2bf1bfdf2b18655c7492 fpsr=00000000
 fmls v2.2d, v11.2d, v29.d[0]   0b305c876a361cea3c09d28cf6ee076d  d46c77c8cebb0584e895f472b500778c  54f54a78f103a62d31c254b0164fb13d  46404eb80a118e805a692736056207d9  d46c77c8cebb0584e895f472b500778c  54f54a78f103a62d31c254b0164fb13d fpsr=00000000
 randV128: 768 calls, 1012 iters